From 86de56487e5f0017ffd5930b0dbd9dda43048849 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 30 Jul 2025 11:58:52 -0700 Subject: bpf: Allow syscall bpf programs to call non-recur helpers Allow syscall programs to call non-recur helpers too, since syscall bpf programs run in process context through the bpf syscall (BPF_PROG_TEST_RUN) and cannot run recursively. bpf_task_storage_{get,set} have "_recur" versions that call trylock instead of taking the lock directly to avoid deadlock when called by bpf programs that run recursively. Currently, only bpf_lsm, bpf_iter, and struct_ops without private stack are allowed to call the non-recur helpers, since they cannot be called recursively from another bpf program. Signed-off-by: Amery Hung Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20250730185903.3574598-2-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 94defa405c85..c823f8efe3ed 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -962,6 +962,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) case BPF_PROG_TYPE_STRUCT_OPS: return prog->aux->jits_use_priv_stack; case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_SYSCALL: return false; default: return true; -- cgit v1.2.3 From d87a513d093726d121dd5c816e26803111a259d0 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 6 Aug 2025 09:25:38 -0700 Subject: bpf: Allow struct_ops to get map id by kdata Add bpf_struct_ops_id() to enable struct_ops implementors to use the struct_ops map id as the unique id of a struct_ops instance in their subsystem. A subsystem that wishes to create a mapping between id and struct_ops instance pointer can update the mapping accordingly during bpf_struct_ops::reg(), unreg(), and update().
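As an illustration, here is a minimal sketch of how a subsystem might maintain such an id-to-instance mapping; the example_* names and the xarray bookkeeping are assumptions for illustration, not part of this patch:

  /* Hypothetical subsystem-side mapping: struct_ops map id -> instance. */
  static DEFINE_XARRAY(example_ops_ids);

  static int example_reg(void *kdata, struct bpf_link *link)
  {
          u32 id = bpf_struct_ops_id(kdata);

          /* Remember which struct_ops map backs this instance. */
          return xa_err(xa_store(&example_ops_ids, id, kdata, GFP_KERNEL));
  }

  static void example_unreg(void *kdata, struct bpf_link *link)
  {
          /* Drop the mapping when the struct_ops is unregistered. */
          xa_erase(&example_ops_ids, bpf_struct_ops_id(kdata));
  }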
Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250806162540.681679-2-ameryhung@gmail.com --- include/linux/bpf.h | 1 + kernel/bpf/bpf_struct_ops.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cc700925b802..e7ee089e8a31 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1985,6 +1985,7 @@ static inline void bpf_module_put(const void *data, struct module *owner) module_put(owner); } int bpf_struct_ops_link_create(union bpf_attr *attr); +u32 bpf_struct_ops_id(const void *kdata); #ifdef CONFIG_NET /* Define it here to avoid the use of forward declaration */ diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 687a3e9c76f5..a41e6730edcf 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -1174,6 +1174,18 @@ void bpf_struct_ops_put(const void *kdata) bpf_map_put(&st_map->map); } +u32 bpf_struct_ops_id(const void *kdata) +{ + struct bpf_struct_ops_value *kvalue; + struct bpf_struct_ops_map *st_map; + + kvalue = container_of(kdata, struct bpf_struct_ops_value, data); + st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); + + return st_map->map.id; +} +EXPORT_SYMBOL_GPL(bpf_struct_ops_id); + static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; -- cgit v1.2.3 From 17e8b7e08fa8bf7a936f70444a42a88750410251 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Jul 2025 09:48:54 +0200 Subject: fs: mark file_remove_privs_flags static file_remove_privs_flags is only used inside of inode.c, mark it static. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/20250724074854.3316911-1-hch@lst.de Signed-off-by: Christian Brauner --- fs/inode.c | 3 +-- include/linux/fs.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index 01ebdc40021e..3cbb78412a3e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2189,7 +2189,7 @@ static int __remove_privs(struct mnt_idmap *idmap, return notify_change(idmap, dentry, &newattrs, NULL); } -int file_remove_privs_flags(struct file *file, unsigned int flags) +static int file_remove_privs_flags(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); @@ -2214,7 +2214,6 @@ int file_remove_privs_flags(struct file *file, unsigned int flags) inode_has_no_xattr(inode); return error; } -EXPORT_SYMBOL_GPL(file_remove_privs_flags); /** * file_remove_privs - remove special file privileges (suid, capabilities) diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..796319914b0a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3393,7 +3393,6 @@ static inline struct inode *new_inode_pseudo(struct super_block *sb) extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); -extern int file_remove_privs_flags(struct file *file, unsigned int flags); extern int file_remove_privs(struct file *); int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); -- cgit v1.2.3 From 4e021920812d164bb02c30cc40e08a3681b1c755 Mon Sep 17 00:00:00 2001 From: Kriish Sharma Date: Wed, 30 Jul 2025 20:18:53 +0000 Subject: fs: document 'name' parameter for name_contains_dotdot() The kernel-doc for 
name_contains_dotdot() was missing the @name parameter description, leading to a warning during make htmldocs. Add the missing documentation to resolve this warning. Signed-off-by: Kriish Sharma Link: https://lore.kernel.org/20250730201853.8436-1-kriish.sharma2006@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 796319914b0a..780e9c774c54 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3281,7 +3281,7 @@ static inline bool is_dot_dotdot(const char *name, size_t len) /** * name_contains_dotdot - check if a file name contains ".." path components - * + * @name: File path string to check * Search for ".." surrounded by either '/' or start/end of string. */ static inline bool name_contains_dotdot(const char *name) -- cgit v1.2.3 From 56ecfd9175b999dfc303ac6a0f9ea4bd1bee49d7 Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Wed, 23 Jul 2025 14:21:54 +0100 Subject: fs: Remove mount_nodev mount_nodev has had no in-tree users since cc0876f817d6 ("vfs: Convert devpts to use the new mount API"). Remove it. Signed-off-by: Pedro Falcato Link: https://lore.kernel.org/20250723132156.225410-2-pfalcato@suse.de Signed-off-by: Christian Brauner --- fs/super.c | 20 -------------------- include/linux/fs.h | 3 --- 2 files changed, 23 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 7f876f32343a..7daa20737f2e 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1773,26 +1773,6 @@ void kill_block_super(struct super_block *sb) EXPORT_SYMBOL(kill_block_super); #endif -struct dentry *mount_nodev(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)) -{ - int error; - struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); - - if (IS_ERR(s)) - return ERR_CAST(s); - - error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - s->s_flags |= SB_ACTIVE; - return dget(s->s_root); -} -EXPORT_SYMBOL(mount_nodev); - /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..204328ed7ebb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2716,9 +2716,6 @@ static inline bool is_mgtime(const struct inode *inode) extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); -extern struct dentry *mount_nodev(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); -- cgit v1.2.3 From f7d161c2804f3ad36bdc3222cb93c8fee67be98c Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Wed, 23 Jul 2025 14:21:55 +0100 Subject: fs: Remove mount_bdev mount_bdev has no in-tree users ever since f2fs adopted the new mount API. Remove it. 
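For context, a filesystem that previously called mount_bdev() would now implement the fs_context API instead; a minimal sketch of the block-device case follows (the example_* names are hypothetical and the superblock setup is elided):

  /* Hypothetical fs_context-based replacement for a mount_bdev() user. */
  static int example_fill_super(struct super_block *sb, struct fs_context *fc)
  {
          /* ... set up sb->s_op, allocate the root inode/dentry, etc. ... */
          return 0;
  }

  static int example_get_tree(struct fs_context *fc)
  {
          return get_tree_bdev(fc, example_fill_super);
  }

  static const struct fs_context_operations example_context_ops = {
          .get_tree = example_get_tree,
  };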
Signed-off-by: Pedro Falcato Link: https://lore.kernel.org/20250723132156.225410-3-pfalcato@suse.de Signed-off-by: Christian Brauner --- fs/super.c | 43 ------------------------------------------- include/linux/fs.h | 3 --- 2 files changed, 46 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 7daa20737f2e..a038848e8d1f 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1716,49 +1716,6 @@ int get_tree_bdev(struct fs_context *fc, } EXPORT_SYMBOL(get_tree_bdev); -static int test_bdev_super(struct super_block *s, void *data) -{ - return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; -} - -struct dentry *mount_bdev(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int)) -{ - struct super_block *s; - int error; - dev_t dev; - - error = lookup_bdev(dev_name, &dev); - if (error) - return ERR_PTR(error); - - flags |= SB_NOSEC; - s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev); - if (IS_ERR(s)) - return ERR_CAST(s); - - if (s->s_root) { - if ((flags ^ s->s_flags) & SB_RDONLY) { - deactivate_locked_super(s); - return ERR_PTR(-EBUSY); - } - } else { - error = setup_bdev_super(s, flags, NULL); - if (!error) - error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - - s->s_flags |= SB_ACTIVE; - } - - return dget(s->s_root); -} -EXPORT_SYMBOL(mount_bdev); - void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; diff --git a/include/linux/fs.h b/include/linux/fs.h index 204328ed7ebb..98afcf455b28 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2713,9 +2713,6 @@ static inline bool is_mgtime(const struct inode *inode) return inode->i_opflags & IOP_MGTIME; } -extern struct dentry *mount_bdev(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); -- cgit v1.2.3 From ad7fe23b4b0dc0c26187df92a5649948ef7049fa Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 6 Aug 2025 16:07:05 +1000 Subject: fscontext: add custom-prefix log helpers Sometimes, errors associated with an fscontext come from the VFS or otherwise outside of the filesystem driver itself. However, the default logging of errorfc will always prefix the message with the filesystem name. So, add some *fcp() wrappers that allow for custom prefixes to be used when emitting information to the fscontext log. Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/20250806-errorfc-mount-too-revealing-v2-1-534b9b4d45bb@cyphar.com Signed-off-by: Christian Brauner --- include/linux/fs_context.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 7773eb870039..671f031be173 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -186,10 +186,12 @@ struct fc_log { extern __attribute__((format(printf, 4, 5))) void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...); -#define __logfc(fc, l, fmt, ...) logfc((fc)->log.log, NULL, \ - l, fmt, ## __VA_ARGS__) -#define __plog(p, l, fmt, ...) logfc((p)->log, (p)->prefix, \ - l, fmt, ## __VA_ARGS__) +#define __logfc(fc, l, fmt, ...) 
\ + logfc((fc)->log.log, NULL, (l), (fmt), ## __VA_ARGS__) +#define __plogp(p, prefix, l, fmt, ...) \ + logfc((p)->log, (prefix), (l), (fmt), ## __VA_ARGS__) +#define __plog(p, l, fmt, ...) __plogp(p, (p)->prefix, l, fmt, ## __VA_ARGS__) + /** * infof - Store supplementary informational message * @fc: The context in which to log the informational message @@ -201,6 +203,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define infof(fc, fmt, ...) __logfc(fc, 'i', fmt, ## __VA_ARGS__) #define info_plog(p, fmt, ...) __plog(p, 'i', fmt, ## __VA_ARGS__) #define infofc(fc, fmt, ...) __plog((&(fc)->log), 'i', fmt, ## __VA_ARGS__) +#define infofcp(fc, prefix, fmt, ...) \ + __plogp((&(fc)->log), prefix, 'i', fmt, ## __VA_ARGS__) /** * warnf - Store supplementary warning message @@ -213,6 +217,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define warnf(fc, fmt, ...) __logfc(fc, 'w', fmt, ## __VA_ARGS__) #define warn_plog(p, fmt, ...) __plog(p, 'w', fmt, ## __VA_ARGS__) #define warnfc(fc, fmt, ...) __plog((&(fc)->log), 'w', fmt, ## __VA_ARGS__) +#define warnfcp(fc, prefix, fmt, ...) \ + __plogp((&(fc)->log), prefix, 'w', fmt, ## __VA_ARGS__) /** * errorf - Store supplementary error message @@ -225,6 +231,8 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define errorf(fc, fmt, ...) __logfc(fc, 'e', fmt, ## __VA_ARGS__) #define error_plog(p, fmt, ...) __plog(p, 'e', fmt, ## __VA_ARGS__) #define errorfc(fc, fmt, ...) __plog((&(fc)->log), 'e', fmt, ## __VA_ARGS__) +#define errorfcp(fc, prefix, fmt, ...) \ + __plogp((&(fc)->log), prefix, 'e', fmt, ## __VA_ARGS__) /** * invalf - Store supplementary invalid argument error message @@ -237,5 +245,7 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, #define invalf(fc, fmt, ...) (errorf(fc, fmt, ## __VA_ARGS__), -EINVAL) #define inval_plog(p, fmt, ...) (error_plog(p, fmt, ## __VA_ARGS__), -EINVAL) #define invalfc(fc, fmt, ...) (errorfc(fc, fmt, ## __VA_ARGS__), -EINVAL) +#define invalfcp(fc, prefix, fmt, ...) \ + (errorfcp(fc, prefix, fmt, ## __VA_ARGS__), -EINVAL) #endif /* _LINUX_FS_CONTEXT_H */ -- cgit v1.2.3 From bb2441402392ef1f49563be68e8f0dcb127ac965 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Tue, 5 Aug 2025 22:40:56 +0300 Subject: regulator: add s2dos05 regulator support S2DOS05 has 1 buck and 4 LDO regulators, used for powering panel/touchscreen. 
Signed-off-by: Dzmitry Sankouski Link: https://patch.msgid.link/20250805-starqltechn_integration_upstream-v8-1-09d8a321fafe@gmail.com Signed-off-by: Mark Brown --- MAINTAINERS | 2 +- drivers/regulator/Kconfig | 8 ++ drivers/regulator/Makefile | 1 + drivers/regulator/s2dos05-regulator.c | 165 ++++++++++++++++++++++++++++++++++ include/linux/regulator/s2dos05.h | 73 +++++++++++++++ 5 files changed, 248 insertions(+), 1 deletion(-) create mode 100644 drivers/regulator/s2dos05-regulator.c create mode 100644 include/linux/regulator/s2dos05.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa4..f35f9b503956 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22387,7 +22387,7 @@ F: Documentation/devicetree/bindings/regulator/samsung,s2m*.yaml F: Documentation/devicetree/bindings/regulator/samsung,s5m*.yaml F: drivers/clk/clk-s2mps11.c F: drivers/mfd/sec*.[ch] -F: drivers/regulator/s2m*.c +F: drivers/regulator/s2*.c F: drivers/regulator/s5m*.c F: drivers/rtc/rtc-s5m.c F: include/linux/mfd/samsung/ diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index eaa6df1c9f80..2399c8a9a1c5 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -1344,6 +1344,14 @@ config REGULATOR_RTQ2208 and two ldos. It features wide output voltage range from 0.4V to 2.05V and the capability to configure the corresponding power stages. +config REGULATOR_S2DOS05 + tristate "Samsung S2DOS05 voltage regulator" + depends on MFD_SEC_CORE || COMPILE_TEST + help + This driver provides support for the voltage regulators of the S2DOS05. + The S2DOS05 is a companion power management IC for the smart phones. + The S2DOS05 has 4 LDOs and 1 BUCK outputs. + config REGULATOR_S2MPA01 tristate "Samsung S2MPA01 voltage regulator" depends on MFD_SEC_CORE || COMPILE_TEST diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index be98b29d6675..78605b0fa0b2 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -156,6 +156,7 @@ obj-$(CONFIG_REGULATOR_RTMV20) += rtmv20-regulator.o obj-$(CONFIG_REGULATOR_RTQ2134) += rtq2134-regulator.o obj-$(CONFIG_REGULATOR_RTQ6752) += rtq6752-regulator.o obj-$(CONFIG_REGULATOR_RTQ2208) += rtq2208-regulator.o +obj-$(CONFIG_REGULATOR_S2DOS05) += s2dos05-regulator.o obj-$(CONFIG_REGULATOR_S2MPA01) += s2mpa01.o obj-$(CONFIG_REGULATOR_S2MPS11) += s2mps11.o obj-$(CONFIG_REGULATOR_S5M8767) += s5m8767.o diff --git a/drivers/regulator/s2dos05-regulator.c b/drivers/regulator/s2dos05-regulator.c new file mode 100644 index 000000000000..1463585c4565 --- /dev/null +++ b/drivers/regulator/s2dos05-regulator.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// s2dos05.c - Regulator driver for the Samsung s2dos05 +// +// Copyright (C) 2025 Dzmitry Sankouski + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct s2dos05_data { + struct regmap *regmap; + struct device *dev; +}; + +#define _BUCK(macro) S2DOS05_BUCK##macro +#define _buck_ops(num) s2dos05_ops##num +#define _LDO(macro) S2DOS05_LDO##macro +#define _REG(ctrl) S2DOS05_REG##ctrl +#define _ldo_ops(num) s2dos05_ops##num +#define _MASK(macro) S2DOS05_ENABLE_MASK##macro +#define _TIME(macro) S2DOS05_ENABLE_TIME##macro + +#define BUCK_DESC(_name, _id, _ops, m, s, v, e, em, t, a) { \ + .name = _name, \ + .id = _id, \ + .ops = _ops, \ + .of_match = of_match_ptr(_name), \ + .of_match_full_name = true, \ + .regulators_node = of_match_ptr("regulators"), \ + .type = 
REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + .min_uV = m, \ + .uV_step = s, \ + .n_voltages = S2DOS05_BUCK_N_VOLTAGES, \ + .vsel_reg = v, \ + .vsel_mask = S2DOS05_BUCK_VSEL_MASK, \ + .enable_reg = e, \ + .enable_mask = em, \ + .enable_time = t, \ + .active_discharge_off = 0, \ + .active_discharge_on = S2DOS05_BUCK_FD_MASK, \ + .active_discharge_reg = a, \ + .active_discharge_mask = S2DOS05_BUCK_FD_MASK \ +} + +#define LDO_DESC(_name, _id, _ops, m, s, v, e, em, t, a) { \ + .name = _name, \ + .id = _id, \ + .ops = _ops, \ + .of_match = of_match_ptr(_name), \ + .of_match_full_name = true, \ + .regulators_node = of_match_ptr("regulators"), \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + .min_uV = m, \ + .uV_step = s, \ + .n_voltages = S2DOS05_LDO_N_VOLTAGES, \ + .vsel_reg = v, \ + .vsel_mask = S2DOS05_LDO_VSEL_MASK, \ + .enable_reg = e, \ + .enable_mask = em, \ + .enable_time = t, \ + .active_discharge_off = 0, \ + .active_discharge_on = S2DOS05_LDO_FD_MASK, \ + .active_discharge_reg = a, \ + .active_discharge_mask = S2DOS05_LDO_FD_MASK \ +} + +static const struct regulator_ops s2dos05_ops = { + .list_voltage = regulator_list_voltage_linear, + .map_voltage = regulator_map_voltage_linear, + .is_enabled = regulator_is_enabled_regmap, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .set_active_discharge = regulator_set_active_discharge_regmap, +}; + +static const struct regulator_desc regulators[S2DOS05_REGULATOR_MAX] = { + // name, id, ops, min_uv, uV_step, vsel_reg, enable_reg + LDO_DESC("ldo1", _LDO(1), &_ldo_ops(), _LDO(_MIN1), + _LDO(_STEP1), _REG(_LDO1_CFG), + _REG(_EN), _MASK(_L1), _TIME(_LDO), _REG(_LDO1_CFG)), + LDO_DESC("ldo2", _LDO(2), &_ldo_ops(), _LDO(_MIN1), + _LDO(_STEP1), _REG(_LDO2_CFG), + _REG(_EN), _MASK(_L2), _TIME(_LDO), _REG(_LDO2_CFG)), + LDO_DESC("ldo3", _LDO(3), &_ldo_ops(), _LDO(_MIN2), + _LDO(_STEP1), _REG(_LDO3_CFG), + _REG(_EN), _MASK(_L3), _TIME(_LDO), _REG(_LDO3_CFG)), + LDO_DESC("ldo4", _LDO(4), &_ldo_ops(), _LDO(_MIN2), + _LDO(_STEP1), _REG(_LDO4_CFG), + _REG(_EN), _MASK(_L4), _TIME(_LDO), _REG(_LDO4_CFG)), + BUCK_DESC("buck", _BUCK(1), &_buck_ops(), _BUCK(_MIN1), + _BUCK(_STEP1), _REG(_BUCK_VOUT), + _REG(_EN), _MASK(_B1), _TIME(_BUCK), _REG(_BUCK_CFG)), +}; + +static int s2dos05_pmic_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct sec_pmic_dev *iodev = dev_get_drvdata(pdev->dev.parent); + struct s2dos05_data *s2dos05; + struct regulator_config config = { }; + unsigned int rdev_num = ARRAY_SIZE(regulators); + + s2dos05 = devm_kzalloc(dev, sizeof(*s2dos05), GFP_KERNEL); + if (!s2dos05) + return -ENOMEM; + + platform_set_drvdata(pdev, s2dos05); + + s2dos05->regmap = iodev->regmap_pmic; + s2dos05->dev = dev; + if (!dev->of_node) + dev->of_node = dev->parent->of_node; + + config.dev = dev; + config.driver_data = s2dos05; + + for (int i = 0; i < rdev_num; i++) { + struct regulator_dev *regulator; + + regulator = devm_regulator_register(&pdev->dev, + ®ulators[i], &config); + if (IS_ERR(regulator)) { + return dev_err_probe(&pdev->dev, PTR_ERR(regulator), + "regulator init failed for %d\n", i); + } + } + + return 0; +} + +static const struct platform_device_id s2dos05_pmic_id[] = { + { "s2dos05-regulator" }, + { }, +}; +MODULE_DEVICE_TABLE(platform, s2dos05_pmic_id); + +static struct platform_driver s2dos05_platform_driver = { 
+ .driver = { + .name = "s2dos05", + }, + .probe = s2dos05_pmic_probe, + .id_table = s2dos05_pmic_id, +}; +module_platform_driver(s2dos05_platform_driver); + +MODULE_AUTHOR("Dzmitry Sankouski "); +MODULE_DESCRIPTION("Samsung S2DOS05 Regulator Driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/regulator/s2dos05.h b/include/linux/regulator/s2dos05.h new file mode 100644 index 000000000000..2e89fcbce769 --- /dev/null +++ b/include/linux/regulator/s2dos05.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +// s2dos05.h +// +// Copyright (c) 2016 Samsung Electronics Co., Ltd +// http://www.samsung.com +// Copyright (C) 2024 Dzmitry Sankouski + +#ifndef __LINUX_S2DOS05_H +#define __LINUX_S2DOS05_H + +// S2DOS05 registers +// Slave Addr : 0xC0 +enum S2DOS05_reg { + S2DOS05_REG_DEV_ID, + S2DOS05_REG_TOPSYS_STAT, + S2DOS05_REG_STAT, + S2DOS05_REG_EN, + S2DOS05_REG_LDO1_CFG, + S2DOS05_REG_LDO2_CFG, + S2DOS05_REG_LDO3_CFG, + S2DOS05_REG_LDO4_CFG, + S2DOS05_REG_BUCK_CFG, + S2DOS05_REG_BUCK_VOUT, + S2DOS05_REG_IRQ_MASK = 0x0D, + S2DOS05_REG_SSD_TSD = 0x0E, + S2DOS05_REG_OCL = 0x10, + S2DOS05_REG_IRQ = 0x11 +}; + +// S2DOS05 regulator ids +enum S2DOS05_regulators { + S2DOS05_LDO1, + S2DOS05_LDO2, + S2DOS05_LDO3, + S2DOS05_LDO4, + S2DOS05_BUCK1, + S2DOS05_REG_MAX, +}; + +#define S2DOS05_IRQ_PWRMT_MASK BIT(5) +#define S2DOS05_IRQ_TSD_MASK BIT(4) +#define S2DOS05_IRQ_SSD_MASK BIT(3) +#define S2DOS05_IRQ_SCP_MASK BIT(2) +#define S2DOS05_IRQ_UVLO_MASK BIT(1) +#define S2DOS05_IRQ_OCD_MASK BIT(0) + +#define S2DOS05_BUCK_MIN1 506250 +#define S2DOS05_LDO_MIN1 1500000 +#define S2DOS05_LDO_MIN2 2700000 +#define S2DOS05_BUCK_STEP1 6250 +#define S2DOS05_LDO_STEP1 25000 +#define S2DOS05_LDO_VSEL_MASK 0x7F +#define S2DOS05_LDO_FD_MASK 0x80 +#define S2DOS05_BUCK_VSEL_MASK 0xFF +#define S2DOS05_BUCK_FD_MASK 0x08 + +#define S2DOS05_ENABLE_MASK_L1 BIT(0) +#define S2DOS05_ENABLE_MASK_L2 BIT(1) +#define S2DOS05_ENABLE_MASK_L3 BIT(2) +#define S2DOS05_ENABLE_MASK_L4 BIT(3) +#define S2DOS05_ENABLE_MASK_B1 BIT(4) + +#define S2DOS05_RAMP_DELAY 12000 + +#define S2DOS05_ENABLE_TIME_LDO 50 +#define S2DOS05_ENABLE_TIME_BUCK 350 + +#define S2DOS05_LDO_N_VOLTAGES (S2DOS05_LDO_VSEL_MASK + 1) +#define S2DOS05_BUCK_N_VOLTAGES (S2DOS05_BUCK_VSEL_MASK + 1) + +#define S2DOS05_REGULATOR_MAX (S2DOS05_REG_MAX) + +#endif // __LINUX_S2DOS05_H -- cgit v1.2.3 From 181fe022ecf8a8e85def0e94852c631c59a8b3f6 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:44 +0200 Subject: gpiolib: add support to register sparse pin range Add support for registering a GPIO<->pin mapping using a list of non-consecutive pins. The core already supports sparse pin ranges (the pins member of struct pinctrl_gpio_range), but it was not possible to register one. If pins is not NULL, the core uses it; otherwise it assumes that a consecutive pin range was registered and uses pin_base. The function gpiochip_add_pin_range(), which allocates and fills the struct pinctrl_gpio_range, was renamed to gpiochip_add_pin_range_with_pins() and the pins parameter was added. Two new functions were added, gpiochip_add_pin_range() and gpiochip_add_sparse_pin_range(), to register a consecutive or sparse pin range. Both use gpiochip_add_pin_range_with_pins().
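As a usage sketch (the pin numbers and pin-controller name below are hypothetical, not from this patch), a driver whose GPIOs 0..3 map to scattered pins could register the range like this:

  /* Hypothetical: GPIOs 0..3 of this chip map to non-consecutive pins. */
  static const unsigned int example_pins[] = { 4, 11, 12, 26 };

  ret = gpiochip_add_sparse_pin_range(gc, "example-pinctrl", 0,
                                      example_pins, ARRAY_SIZE(example_pins));
  if (ret)
          return ret;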
Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-1-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 29 ++++++++++++++++++-------- include/linux/gpio/driver.h | 51 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 68 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 0d2b470a252e..98d2fa602490 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -2349,11 +2349,13 @@ int gpiochip_add_pingroup_range(struct gpio_chip *gc, EXPORT_SYMBOL_GPL(gpiochip_add_pingroup_range); /** - * gpiochip_add_pin_range() - add a range for GPIO <-> pin mapping + * gpiochip_add_pin_range_with_pins() - add a range for GPIO <-> pin mapping * @gc: the gpiochip to add the range for * @pinctl_name: the dev_name() of the pin controller to map to * @gpio_offset: the start offset in the current gpio_chip number space * @pin_offset: the start offset in the pin controller number space + * @pins: the list of non consecutive pins to accumulate in this range (if not + * NULL, pin_offset is ignored by pinctrl core) * @npins: the number of pins from the offset of each pin space (GPIO and * pin controller) to accumulate in this range * @@ -2365,9 +2367,12 @@ EXPORT_SYMBOL_GPL(gpiochip_add_pingroup_range); * Returns: * 0 on success, or a negative errno on failure. */ -int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, - unsigned int gpio_offset, unsigned int pin_offset, - unsigned int npins) +int gpiochip_add_pin_range_with_pins(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int pin_offset, + unsigned int const *pins, + unsigned int npins) { struct gpio_pin_range *pin_range; struct gpio_device *gdev = gc->gpiodev; @@ -2385,6 +2390,7 @@ int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, pin_range->range.name = gc->label; pin_range->range.base = gdev->base + gpio_offset; pin_range->range.pin_base = pin_offset; + pin_range->range.pins = pins; pin_range->range.npins = npins; pin_range->pctldev = pinctrl_find_and_add_gpio_range(pinctl_name, &pin_range->range); @@ -2394,16 +2400,21 @@ int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, kfree(pin_range); return ret; } - chip_dbg(gc, "created GPIO range %d->%d ==> %s PIN %d->%d\n", - gpio_offset, gpio_offset + npins - 1, - pinctl_name, - pin_offset, pin_offset + npins - 1); + if (pin_range->range.pins) + chip_dbg(gc, "created GPIO range %d->%d ==> %s %d sparse PIN range { %d, ... 
}", + gpio_offset, gpio_offset + npins - 1, + pinctl_name, npins, pins[0]); + else + chip_dbg(gc, "created GPIO range %d->%d ==> %s PIN %d->%d\n", + gpio_offset, gpio_offset + npins - 1, + pinctl_name, + pin_offset, pin_offset + npins - 1); list_add_tail(&pin_range->node, &gdev->pin_ranges); return 0; } -EXPORT_SYMBOL_GPL(gpiochip_add_pin_range); +EXPORT_SYMBOL_GPL(gpiochip_add_pin_range_with_pins); /** * gpiochip_remove_pin_ranges() - remove all the GPIO <-> pin mappings diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 667f8fd58a79..9fcd4a988081 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -772,16 +772,50 @@ struct gpio_pin_range { #ifdef CONFIG_PINCTRL -int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, - unsigned int gpio_offset, unsigned int pin_offset, - unsigned int npins); +int gpiochip_add_pin_range_with_pins(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int pin_offset, + unsigned int const *pins, + unsigned int npins); int gpiochip_add_pingroup_range(struct gpio_chip *gc, struct pinctrl_dev *pctldev, unsigned int gpio_offset, const char *pin_group); void gpiochip_remove_pin_ranges(struct gpio_chip *gc); +static inline int +gpiochip_add_pin_range(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int pin_offset, + unsigned int npins) +{ + return gpiochip_add_pin_range_with_pins(gc, pinctl_name, gpio_offset, + pin_offset, NULL, npins); +} + +static inline int +gpiochip_add_sparse_pin_range(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int const *pins, + unsigned int npins) +{ + return gpiochip_add_pin_range_with_pins(gc, pinctl_name, gpio_offset, 0, + pins, npins); +} #else /* ! CONFIG_PINCTRL */ +static inline int +gpiochip_add_pin_range_with_pins(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int pin_offset, + unsigned int npins) +{ + return 0; +} + static inline int gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, unsigned int gpio_offset, unsigned int pin_offset, @@ -789,6 +823,17 @@ gpiochip_add_pin_range(struct gpio_chip *gc, const char *pinctl_name, { return 0; } + +static inline int +gpiochip_add_sparse_pin_range(struct gpio_chip *gc, + const char *pinctl_name, + unsigned int gpio_offset, + unsigned int const *pins, + unsigned int npins) +{ + return 0; +} + static inline int gpiochip_add_pingroup_range(struct gpio_chip *gc, struct pinctrl_dev *pctldev, -- cgit v1.2.3 From 6e986f8852f56cf9214ea2ec02b4b432e201d02c Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:49 +0200 Subject: gpio: aggregator: export symbols of the GPIO forwarder library Export all symbols and create header file for the GPIO forwarder library. It will be used in the next changes. 
Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-6-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aggregator.c | 202 +++++++++++++++++++++++++++++++++++++++-- include/linux/gpio/forwarder.h | 37 ++++++++ 2 files changed, 233 insertions(+), 6 deletions(-) create mode 100644 include/linux/gpio/forwarder.h (limited to 'include/linux') diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index cc54d19f2dd1..c1952b28b7b7 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include +#include #include #include "dev-sync-probe.h" @@ -475,8 +477,180 @@ static int gpiochip_fwd_setup_delay_line(struct gpiochip_fwd *fwd) } #endif /* !CONFIG_OF_GPIO */ -static struct gpiochip_fwd * -devm_gpiochip_fwd_alloc(struct device *dev, unsigned int ngpios) +/** + * gpiochip_fwd_get_gpiochip - Get the GPIO chip for the GPIO forwarder + * @fwd: GPIO forwarder + * + * Returns: The GPIO chip for the GPIO forwarder + */ +struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd) +{ + return &fwd->chip; +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_get_gpiochip, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_get_direction - Return the current direction of a GPIO forwarder line + * @fwd: GPIO forwarder + * @offset: the offset of the line + * + * Returns: 0 for output, 1 for input, or an error code in case of error. + */ +int gpiochip_fwd_gpio_get_direction(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_get_direction(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_get_direction, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_direction_output - Set a GPIO forwarder line direction to + * output + * @fwd: GPIO forwarder + * @offset: the offset of the line + * @value: value to set + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_direction_output(struct gpiochip_fwd *fwd, unsigned int offset, + int value) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_direction_output(gc, offset, value); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_direction_output, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_direction_input - Set a GPIO forwarder line direction to input + * @fwd: GPIO forwarder + * @offset: the offset of the line + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_direction_input(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_direction_input(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_direction_input, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_get - Return a GPIO forwarder line's value + * @fwd: GPIO forwarder + * @offset: the offset of the line + * + * Returns: The GPIO's logical value, i.e. taking the ACTIVE_LOW status into + * account, or negative errno on failure. 
+ */ +int gpiochip_fwd_gpio_get(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_get(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_get, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_get_multiple - Get values for multiple GPIO forwarder lines + * @fwd: GPIO forwarder + * @mask: bit mask array; one bit per line; BITS_PER_LONG bits per word defines + * which lines are to be read + * @bits: bit value array; one bit per line; BITS_PER_LONG bits per word will + * contains the read values for the lines specified by mask + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_get_multiple(struct gpiochip_fwd *fwd, unsigned long *mask, + unsigned long *bits) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_get_multiple_locked(gc, mask, bits); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_get_multiple, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_set - Assign value to a GPIO forwarder line. + * @fwd: GPIO forwarder + * @offset: the offset of the line + * @value: value to set + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_set(struct gpiochip_fwd *fwd, unsigned int offset, int value) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_set(gc, offset, value); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_set, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_set_multiple - Assign values to multiple GPIO forwarder lines + * @fwd: GPIO forwarder + * @mask: bit mask array; one bit per output; BITS_PER_LONG bits per word + * defines which outputs are to be changed + * @bits: bit value array; one bit per output; BITS_PER_LONG bits per word + * defines the values the outputs specified by mask are to be set to + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_gpio_set_multiple(struct gpiochip_fwd *fwd, unsigned long *mask, + unsigned long *bits) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_set_multiple_locked(gc, mask, bits); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_set_multiple, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_set_config - Set @config for a GPIO forwarder line + * @fwd: GPIO forwarder + * @offset: the offset of the line + * @config: Same packed config format as generic pinconf + * + * Returns: 0 on success, %-ENOTSUPP if the controller doesn't support setting + * the configuration. + */ +int gpiochip_fwd_gpio_set_config(struct gpiochip_fwd *fwd, unsigned int offset, + unsigned long config) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_set_config(gc, offset, config); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_set_config, "GPIO_FORWARDER"); + +/** + * gpiochip_fwd_gpio_to_irq - Return the IRQ corresponding to a GPIO forwarder line + * @fwd: GPIO forwarder + * @offset: the offset of the line + * + * Returns: The Linux IRQ corresponding to the passed line, or an error code in + * case of error. 
+ */ +int gpiochip_fwd_gpio_to_irq(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_to_irq(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_to_irq, "GPIO_FORWARDER"); + +/** + * devm_gpiochip_fwd_alloc - Allocate and initialize a new GPIO forwarder + * @dev: Parent device pointer + * @ngpios: Number of GPIOs in the forwarder + * + * Returns: An opaque object pointer, or an ERR_PTR()-encoded negative error + * code on failure. + */ +struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, + unsigned int ngpios) { struct gpiochip_fwd *fwd; struct gpio_chip *chip; @@ -507,10 +681,18 @@ devm_gpiochip_fwd_alloc(struct device *dev, unsigned int ngpios) return fwd; } +EXPORT_SYMBOL_NS_GPL(devm_gpiochip_fwd_alloc, "GPIO_FORWARDER"); -static int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, - struct gpio_desc *desc, - unsigned int offset) +/** + * gpiochip_fwd_desc_add - Add a GPIO desc in the forwarder + * @fwd: GPIO forwarder + * @desc: GPIO descriptor to register + * @offset: offset for the GPIO in the forwarder + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, struct gpio_desc *desc, + unsigned int offset) { struct gpio_chip *parent = gpiod_to_chip(desc); struct gpio_chip *chip = &fwd->chip; @@ -537,8 +719,15 @@ static int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, return 0; } +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_desc_add, "GPIO_FORWARDER"); -static int gpiochip_fwd_register(struct gpiochip_fwd *fwd) +/** + * gpiochip_fwd_register - Register a GPIO forwarder + * @fwd: GPIO forwarder + * + * Returns: 0 on success, or negative errno on failure. + */ +int gpiochip_fwd_register(struct gpiochip_fwd *fwd) { struct gpio_chip *chip = &fwd->chip; @@ -549,6 +738,7 @@ static int gpiochip_fwd_register(struct gpiochip_fwd *fwd) return devm_gpiochip_add_data(chip->parent, chip, fwd); } +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_register, "GPIO_FORWARDER"); /** * gpiochip_fwd_create() - Create a new GPIO forwarder diff --git a/include/linux/gpio/forwarder.h b/include/linux/gpio/forwarder.h new file mode 100644 index 000000000000..e21a1b7b1905 --- /dev/null +++ b/include/linux/gpio/forwarder.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_GPIO_FORWARDER_H +#define __LINUX_GPIO_FORWARDER_H + +struct gpio_desc; +struct gpio_chip; +struct gpiochip_fwd; + +struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, + unsigned int ngpios); +int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, + struct gpio_desc *desc, unsigned int offset); +int gpiochip_fwd_register(struct gpiochip_fwd *fwd); + +struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd); + +int gpiochip_fwd_gpio_get_direction(struct gpiochip_fwd *fwd, + unsigned int offset); +int gpiochip_fwd_gpio_direction_input(struct gpiochip_fwd *fwd, + unsigned int offset); +int gpiochip_fwd_gpio_direction_output(struct gpiochip_fwd *fwd, + unsigned int offset, + int value); +int gpiochip_fwd_gpio_get(struct gpiochip_fwd *fwd, unsigned int offset); +int gpiochip_fwd_gpio_get_multiple(struct gpiochip_fwd *fwd, + unsigned long *mask, + unsigned long *bits); +int gpiochip_fwd_gpio_set(struct gpiochip_fwd *fwd, unsigned int offset, + int value); +int gpiochip_fwd_gpio_set_multiple(struct gpiochip_fwd *fwd, + unsigned long *mask, + unsigned long *bits); +int gpiochip_fwd_gpio_set_config(struct gpiochip_fwd *fwd, unsigned int offset, + unsigned long config); +int 
gpiochip_fwd_gpio_to_irq(struct gpiochip_fwd *fwd, unsigned int offset); + +#endif -- cgit v1.2.3 From b31c68fd851e74526ad963362ea205eb97b9a710 Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:50 +0200 Subject: gpio: aggregator: handle runtime registration of gpio_desc in gpiochip_fwd Add a request() callback to check that the GPIO descriptor was actually registered in the gpiochip_fwd before using it. This is done to handle the case where a GPIO descriptor is added to the forwarder at runtime. If at least one GPIO descriptor was not added before the forwarder registration, we assume the forwarder can sleep, since a GPIO added at runtime may itself sleep. Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-7-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aggregator.c | 63 ++++++++++++++++++++++++++++++++++++++---- include/linux/gpio/forwarder.h | 2 ++ 2 files changed, 59 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index c1952b28b7b7..f0d38d76cf73 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -246,6 +246,7 @@ struct gpiochip_fwd { spinlock_t slock; /* protects tmp[] if !can_sleep */ }; struct gpiochip_fwd_timing *delay_timings; + unsigned long *valid_mask; unsigned long tmp[]; /* values and descs for multiple ops */ }; @@ -254,10 +255,24 @@ struct gpiochip_fwd { #define fwd_tmp_size(ngpios) (BITS_TO_LONGS((ngpios)) + (ngpios)) +static int gpio_fwd_request(struct gpio_chip *chip, unsigned int offset) +{ + struct gpiochip_fwd *fwd = gpiochip_get_data(chip); + + return test_bit(offset, fwd->valid_mask) ? 0 : -ENODEV; +} + static int gpio_fwd_get_direction(struct gpio_chip *chip, unsigned int offset) { struct gpiochip_fwd *fwd = gpiochip_get_data(chip); + /* + * get_direction() is called during gpiochip registration, return + * -ENODEV if there is no GPIO desc for the line. + */ + if (!test_bit(offset, fwd->valid_mask)) + return -ENODEV; + return gpiod_get_direction(fwd->descs[offset]); } @@ -489,6 +504,21 @@ struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd) } EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_get_gpiochip, "GPIO_FORWARDER"); +/** + * gpiochip_fwd_gpio_request - Request a line of the GPIO forwarder + * @fwd: GPIO forwarder + * @offset: the offset of the line to request + * + * Returns: 0 on success, or negative errno on failure.
+ */ +int gpiochip_fwd_gpio_request(struct gpiochip_fwd *fwd, unsigned int offset) +{ + struct gpio_chip *gc = gpiochip_fwd_get_gpiochip(fwd); + + return gpio_fwd_request(gc, offset); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_gpio_request, "GPIO_FORWARDER"); + /** * gpiochip_fwd_gpio_get_direction - Return the current direction of a GPIO forwarder line * @fwd: GPIO forwarder @@ -663,11 +693,16 @@ struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, if (!fwd->descs) return ERR_PTR(-ENOMEM); + fwd->valid_mask = devm_bitmap_zalloc(dev, ngpios, GFP_KERNEL); + if (!fwd->valid_mask) + return ERR_PTR(-ENOMEM); + chip = &fwd->chip; chip->label = dev_name(dev); chip->parent = dev; chip->owner = THIS_MODULE; + chip->request = gpio_fwd_request; chip->get_direction = gpio_fwd_get_direction; chip->direction_input = gpio_fwd_direction_input; chip->direction_output = gpio_fwd_direction_output; @@ -694,24 +729,21 @@ EXPORT_SYMBOL_NS_GPL(devm_gpiochip_fwd_alloc, "GPIO_FORWARDER"); int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, struct gpio_desc *desc, unsigned int offset) { - struct gpio_chip *parent = gpiod_to_chip(desc); struct gpio_chip *chip = &fwd->chip; if (offset > chip->ngpio) return -EINVAL; + if (test_and_set_bit(offset, fwd->valid_mask)) + return -EEXIST; + /* * If any of the GPIO lines are sleeping, then the entire forwarder * will be sleeping. - * If any of the chips support .set_config(), then the forwarder will - * support setting configs. */ if (gpiod_cansleep(desc)) chip->can_sleep = true; - if (parent && parent->set_config) - chip->set_config = gpio_fwd_set_config; - fwd->descs[offset] = desc; dev_dbg(chip->parent, "%u => gpio %d irq %d\n", offset, @@ -721,6 +753,18 @@ int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, struct gpio_desc *desc, } EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_desc_add, "GPIO_FORWARDER"); +/** + * gpiochip_fwd_desc_free - Remove a GPIO desc from the forwarder + * @fwd: GPIO forwarder + * @offset: offset of GPIO desc to remove + */ +void gpiochip_fwd_desc_free(struct gpiochip_fwd *fwd, unsigned int offset) +{ + if (test_and_clear_bit(offset, fwd->valid_mask)) + gpiod_put(fwd->descs[offset]); +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_desc_free, "GPIO_FORWARDER"); + /** * gpiochip_fwd_register - Register a GPIO forwarder * @fwd: GPIO forwarder @@ -731,6 +775,13 @@ int gpiochip_fwd_register(struct gpiochip_fwd *fwd) { struct gpio_chip *chip = &fwd->chip; + /* + * Some gpio_desc were not registered. They will be registered at runtime + * but we have to suppose they can sleep. 
+ */ + if (!bitmap_full(fwd->valid_mask, chip->ngpio)) + chip->can_sleep = true; + if (chip->can_sleep) mutex_init(&fwd->mlock); else diff --git a/include/linux/gpio/forwarder.h b/include/linux/gpio/forwarder.h index e21a1b7b1905..45e0190308f0 100644 --- a/include/linux/gpio/forwarder.h +++ b/include/linux/gpio/forwarder.h @@ -10,10 +10,12 @@ struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, unsigned int ngpios); int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, struct gpio_desc *desc, unsigned int offset); +void gpiochip_fwd_desc_free(struct gpiochip_fwd *fwd, unsigned int offset); int gpiochip_fwd_register(struct gpiochip_fwd *fwd); struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd); +int gpiochip_fwd_gpio_request(struct gpiochip_fwd *fwd, unsigned int offset); int gpiochip_fwd_gpio_get_direction(struct gpiochip_fwd *fwd, unsigned int offset); int gpiochip_fwd_gpio_direction_input(struct gpiochip_fwd *fwd, -- cgit v1.2.3 From 60e92c1009c7c6abd4a9d0caf33a8cba5d09f67c Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:51 +0200 Subject: gpio: aggregator: add possibility to attach data to the forwarder Add a data pointer to store private data in the forwarder. Reviewed-by: Andy Shevchenko Reviewed-by: Geert Uytterhoeven Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-8-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-aggregator.c | 20 ++++++++++++++++++-- include/linux/gpio/forwarder.h | 4 +++- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index f0d38d76cf73..fb3694d581d1 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -246,6 +246,7 @@ struct gpiochip_fwd { spinlock_t slock; /* protects tmp[] if !can_sleep */ }; struct gpiochip_fwd_timing *delay_timings; + void *data; unsigned long *valid_mask; unsigned long tmp[]; /* values and descs for multiple ops */ }; @@ -504,6 +505,18 @@ struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd) } EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_get_gpiochip, "GPIO_FORWARDER"); +/** + * gpiochip_fwd_get_data - Get driver-private data for the GPIO forwarder + * @fwd: GPIO forwarder + * + * Returns: The driver-private data for the GPIO forwarder + */ +void *gpiochip_fwd_get_data(struct gpiochip_fwd *fwd) +{ + return fwd->data; +} +EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_get_data, "GPIO_FORWARDER"); + /** * gpiochip_fwd_gpio_request - Request a line of the GPIO forwarder * @fwd: GPIO forwarder @@ -768,10 +781,11 @@ EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_desc_free, "GPIO_FORWARDER"); /** * gpiochip_fwd_register - Register a GPIO forwarder * @fwd: GPIO forwarder + * @data: driver-private data associated with this forwarder * * Returns: 0 on success, or negative errno on failure. 
*/ -int gpiochip_fwd_register(struct gpiochip_fwd *fwd) +int gpiochip_fwd_register(struct gpiochip_fwd *fwd, void *data) { struct gpio_chip *chip = &fwd->chip; @@ -787,6 +801,8 @@ int gpiochip_fwd_register(struct gpiochip_fwd *fwd) else spin_lock_init(&fwd->slock); + fwd->data = data; + return devm_gpiochip_add_data(chip->parent, chip, fwd); } EXPORT_SYMBOL_NS_GPL(gpiochip_fwd_register, "GPIO_FORWARDER"); @@ -831,7 +847,7 @@ static struct gpiochip_fwd *gpiochip_fwd_create(struct device *dev, return ERR_PTR(error); } - error = gpiochip_fwd_register(fwd); + error = gpiochip_fwd_register(fwd, NULL); if (error) return ERR_PTR(error); diff --git a/include/linux/gpio/forwarder.h b/include/linux/gpio/forwarder.h index 45e0190308f0..ee5d8355f735 100644 --- a/include/linux/gpio/forwarder.h +++ b/include/linux/gpio/forwarder.h @@ -11,10 +11,12 @@ struct gpiochip_fwd *devm_gpiochip_fwd_alloc(struct device *dev, unsigned int ngpios); int gpiochip_fwd_desc_add(struct gpiochip_fwd *fwd, struct gpio_desc *desc, unsigned int offset); void gpiochip_fwd_desc_free(struct gpiochip_fwd *fwd, unsigned int offset); -int gpiochip_fwd_register(struct gpiochip_fwd *fwd); +int gpiochip_fwd_register(struct gpiochip_fwd *fwd, void *data); struct gpio_chip *gpiochip_fwd_get_gpiochip(struct gpiochip_fwd *fwd); +void *gpiochip_fwd_get_data(struct gpiochip_fwd *fwd); + int gpiochip_fwd_gpio_request(struct gpiochip_fwd *fwd, unsigned int offset); int gpiochip_fwd_gpio_get_direction(struct gpiochip_fwd *fwd, unsigned int offset); -- cgit v1.2.3 From 53ec9169db1345f04174febb90f88a871fc28d9e Mon Sep 17 00:00:00 2001 From: Thomas Richard Date: Mon, 11 Aug 2025 15:25:52 +0200 Subject: lib/string_choices: Add str_input_output() helper Add a str_input_output() helper to return the 'input' or 'output' string literal. Also add the inverse variant str_output_input(). Suggested-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Signed-off-by: Thomas Richard Link: https://lore.kernel.org/r/20250811-aaeon-up-board-pinctrl-support-v9-9-29f0cbbdfb30@bootlin.com Signed-off-by: Bartosz Golaszewski --- include/linux/string_choices.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index f3ba4f52ff26..a27c87c954ae 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -41,6 +41,12 @@ static inline const char *str_high_low(bool v) } #define str_low_high(v) str_high_low(!(v)) +static inline const char *str_input_output(bool v) +{ + return v ? "input" : "output"; +} +#define str_output_input(v) str_input_output(!(v)) + static inline const char *str_on_off(bool v) { return v ? "on" : "off"; -- cgit v1.2.3 From 0daf35da397b083ea0ea5407196bb6bd210530ec Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Thu, 7 Aug 2025 13:13:10 +0530 Subject: soc: qcom: mdt_loader: Remove pas id parameter The pas id is not used in qcom_mdt_load_no_init() and it should not be used, as this is a non-PAS-specific function and has no relation to the PAS-specific mechanism.
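After this change a caller simply drops the pas id argument; a hedged sketch of an updated call site (the surrounding variables are assumed to be set up as in the existing callers):

  /* Hypothetical caller: fw, mem_region, mem_phys, mem_size prepared earlier. */
  ret = qcom_mdt_load_no_init(dev, fw, fw_name, mem_region,
                              mem_phys, mem_size, &reloc_base);
  if (ret)
          return ret;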
Reviewed-by: Dikshita Agarwal Acked-by: Jeff Johnson # drivers/net/wireless/ath/ath12k/ahb.c Signed-off-by: Mukesh Ojha Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20250807074311.2381713-2-mukesh.ojha@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/media/platform/qcom/venus/firmware.c | 4 ++-- drivers/net/wireless/ath/ath12k/ahb.c | 2 +- drivers/remoteproc/qcom_q6v5_adsp.c | 2 +- drivers/remoteproc/qcom_q6v5_pas.c | 7 +++---- drivers/remoteproc/qcom_q6v5_wcss.c | 2 +- drivers/soc/qcom/mdt_loader.c | 10 ++++------ include/linux/soc/qcom/mdt_loader.h | 7 +++---- 7 files changed, 15 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/platform/qcom/venus/firmware.c b/drivers/media/platform/qcom/venus/firmware.c index 66a18830e66d..862d0718f694 100644 --- a/drivers/media/platform/qcom/venus/firmware.c +++ b/drivers/media/platform/qcom/venus/firmware.c @@ -136,8 +136,8 @@ static int venus_load_fw(struct venus_core *core, const char *fwname, ret = qcom_mdt_load(dev, mdt, fwname, VENUS_PAS_ID, mem_va, *mem_phys, *mem_size, NULL); else - ret = qcom_mdt_load_no_init(dev, mdt, fwname, VENUS_PAS_ID, - mem_va, *mem_phys, *mem_size, NULL); + ret = qcom_mdt_load_no_init(dev, mdt, fwname, mem_va, + *mem_phys, *mem_size, NULL); memunmap(mem_va); err_release_fw: diff --git a/drivers/net/wireless/ath/ath12k/ahb.c b/drivers/net/wireless/ath/ath12k/ahb.c index 3b983f4e3268..b30527c402f6 100644 --- a/drivers/net/wireless/ath/ath12k/ahb.c +++ b/drivers/net/wireless/ath/ath12k/ahb.c @@ -414,7 +414,7 @@ static int ath12k_ahb_power_up(struct ath12k_base *ab) goto err_fw2; } - ret = qcom_mdt_load_no_init(dev, fw2, fw2_name, pasid, mem_region, mem_phys, + ret = qcom_mdt_load_no_init(dev, fw2, fw2_name, mem_region, mem_phys, mem_size, &mem_phys); if (ret) { ath12k_err(ab, "Failed to load MDT segments: %d\n", ret); diff --git a/drivers/remoteproc/qcom_q6v5_adsp.c b/drivers/remoteproc/qcom_q6v5_adsp.c index 94af77baa7a1..e98b7e03162c 100644 --- a/drivers/remoteproc/qcom_q6v5_adsp.c +++ b/drivers/remoteproc/qcom_q6v5_adsp.c @@ -317,7 +317,7 @@ static int adsp_load(struct rproc *rproc, const struct firmware *fw) struct qcom_adsp *adsp = rproc->priv; int ret; - ret = qcom_mdt_load_no_init(adsp->dev, fw, rproc->firmware, 0, + ret = qcom_mdt_load_no_init(adsp->dev, fw, rproc->firmware, adsp->mem_region, adsp->mem_phys, adsp->mem_size, &adsp->mem_reloc); if (ret) diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c index 02e29171cbbe..55a7da801183 100644 --- a/drivers/remoteproc/qcom_q6v5_pas.c +++ b/drivers/remoteproc/qcom_q6v5_pas.c @@ -242,9 +242,8 @@ static int qcom_pas_load(struct rproc *rproc, const struct firmware *fw) goto release_dtb_firmware; ret = qcom_mdt_load_no_init(pas->dev, pas->dtb_firmware, pas->dtb_firmware_name, - pas->dtb_pas_id, pas->dtb_mem_region, - pas->dtb_mem_phys, pas->dtb_mem_size, - &pas->dtb_mem_reloc); + pas->dtb_mem_region, pas->dtb_mem_phys, + pas->dtb_mem_size, &pas->dtb_mem_reloc); if (ret) goto release_dtb_metadata; } @@ -307,7 +306,7 @@ static int qcom_pas_start(struct rproc *rproc) if (ret) goto disable_px_supply; - ret = qcom_mdt_load_no_init(pas->dev, pas->firmware, rproc->firmware, pas->pas_id, + ret = qcom_mdt_load_no_init(pas->dev, pas->firmware, rproc->firmware, pas->mem_region, pas->mem_phys, pas->mem_size, &pas->mem_reloc); if (ret) diff --git a/drivers/remoteproc/qcom_q6v5_wcss.c b/drivers/remoteproc/qcom_q6v5_wcss.c index 93648734a2f2..07c88623f597 100644 --- 
a/drivers/remoteproc/qcom_q6v5_wcss.c +++ b/drivers/remoteproc/qcom_q6v5_wcss.c @@ -757,7 +757,7 @@ static int q6v5_wcss_load(struct rproc *rproc, const struct firmware *fw) int ret; ret = qcom_mdt_load_no_init(wcss->dev, fw, rproc->firmware, - 0, wcss->mem_region, wcss->mem_phys, + wcss->mem_region, wcss->mem_phys, wcss->mem_size, &wcss->mem_reloc); if (ret) return ret; diff --git a/drivers/soc/qcom/mdt_loader.c b/drivers/soc/qcom/mdt_loader.c index dfd15d189087..74c415774657 100644 --- a/drivers/soc/qcom/mdt_loader.c +++ b/drivers/soc/qcom/mdt_loader.c @@ -331,7 +331,7 @@ static bool qcom_mdt_bins_are_split(const struct firmware *fw, const char *fw_na } static int __qcom_mdt_load(struct device *dev, const struct firmware *fw, - const char *fw_name, int pas_id, void *mem_region, + const char *fw_name, void *mem_region, phys_addr_t mem_phys, size_t mem_size, phys_addr_t *reloc_base) { @@ -458,7 +458,7 @@ int qcom_mdt_load(struct device *dev, const struct firmware *fw, if (ret) return ret; - return __qcom_mdt_load(dev, fw, firmware, pas_id, mem_region, mem_phys, + return __qcom_mdt_load(dev, fw, firmware, mem_region, mem_phys, mem_size, reloc_base); } EXPORT_SYMBOL_GPL(qcom_mdt_load); @@ -468,7 +468,6 @@ EXPORT_SYMBOL_GPL(qcom_mdt_load); * @dev: device handle to associate resources with * @fw: firmware object for the mdt file * @firmware: name of the firmware, for construction of segment file names - * @pas_id: PAS identifier * @mem_region: allocated memory region to load firmware into * @mem_phys: physical address of allocated memory region * @mem_size: size of the allocated memory region @@ -477,11 +476,10 @@ EXPORT_SYMBOL_GPL(qcom_mdt_load); * Returns 0 on success, negative errno otherwise. */ int qcom_mdt_load_no_init(struct device *dev, const struct firmware *fw, - const char *firmware, int pas_id, - void *mem_region, phys_addr_t mem_phys, + const char *firmware, void *mem_region, phys_addr_t mem_phys, size_t mem_size, phys_addr_t *reloc_base) { - return __qcom_mdt_load(dev, fw, firmware, pas_id, mem_region, mem_phys, + return __qcom_mdt_load(dev, fw, firmware, mem_region, mem_phys, mem_size, reloc_base); } EXPORT_SYMBOL_GPL(qcom_mdt_load_no_init); diff --git a/include/linux/soc/qcom/mdt_loader.h b/include/linux/soc/qcom/mdt_loader.h index 9e8e60421192..8ea8230579a2 100644 --- a/include/linux/soc/qcom/mdt_loader.h +++ b/include/linux/soc/qcom/mdt_loader.h @@ -24,7 +24,7 @@ int qcom_mdt_load(struct device *dev, const struct firmware *fw, phys_addr_t *reloc_base); int qcom_mdt_load_no_init(struct device *dev, const struct firmware *fw, - const char *fw_name, int pas_id, void *mem_region, + const char *fw_name, void *mem_region, phys_addr_t mem_phys, size_t mem_size, phys_addr_t *reloc_base); void *qcom_mdt_read_metadata(const struct firmware *fw, size_t *data_len, @@ -54,9 +54,8 @@ static inline int qcom_mdt_load(struct device *dev, const struct firmware *fw, static inline int qcom_mdt_load_no_init(struct device *dev, const struct firmware *fw, - const char *fw_name, int pas_id, - void *mem_region, phys_addr_t mem_phys, - size_t mem_size, + const char *fw_name, void *mem_region, + phys_addr_t mem_phys, size_t mem_size, phys_addr_t *reloc_base) { return -ENODEV; -- cgit v1.2.3 From ce8370e2e62a903e18be7dd0e0be2eee079501e1 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 6 Aug 2025 17:04:07 -0400 Subject: audit: record fanotify event regardless of presence of rules When no audit rules are in place, fanotify event results are unconditionally dropped due to an explicit check for 
the existence of any audit rules. Given this is a report from another security sub-system, allow it to be recorded regardless of the existence of any audit rules. To test, install and run the fapolicyd daemon with default config. Then as an unprivileged user, create and run a very simple binary that should be denied. Then check for an event with ausearch -m FANOTIFY -ts recent Link: https://issues.redhat.com/browse/RHEL-9065 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index a394614ccd0b..e3f06eba9c6e 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -527,7 +527,7 @@ static inline void audit_log_kern_module(const char *name) static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { - if (!audit_dummy_context()) + if (audit_enabled) __audit_fanotify(response, friar); } -- cgit v1.2.3 From 5816bf4273edb32716a88c796e0b04f0e12962eb Mon Sep 17 00:00:00 2001 From: Blaise Boscaccy Date: Tue, 22 Jul 2025 14:21:34 -0700 Subject: lsm,selinux: Add LSM blob support for BPF objects This patch introduces LSM blob support for BPF maps, programs, and tokens to enable LSM stacking and multiplexing of LSM modules that govern BPF objects. Additionally, the existing BPF hooks used by SELinux have been updated to utilize the new blob infrastructure, removing the assumption of exclusive ownership of the security pointer. Signed-off-by: Blaise Boscaccy [PM: dropped local variable init, style fixes] Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 3 ++ security/security.c | 86 +++++++++++++++++++++++++++++++++++++-- security/selinux/hooks.c | 56 +++++-------------------- security/selinux/include/objsec.h | 20 +++++++++ 4 files changed, 116 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 090d1d3e19fe..79ec5a2bdcca 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -116,6 +116,9 @@ struct lsm_blob_sizes { int lbs_xattr_count; /* number of xattr slots in new_xattrs array */ int lbs_tun_dev; int lbs_bdev; + int lbs_bpf_map; + int lbs_bpf_prog; + int lbs_bpf_token; }; /* diff --git a/security/security.c b/security/security.c index a88ebfca3224..ca126b02d2fe 100644 --- a/security/security.c +++ b/security/security.c @@ -283,6 +283,9 @@ static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed) lsm_set_blob_size(&needed->lbs_xattr_count, &blob_sizes.lbs_xattr_count); lsm_set_blob_size(&needed->lbs_bdev, &blob_sizes.lbs_bdev); + lsm_set_blob_size(&needed->lbs_bpf_map, &blob_sizes.lbs_bpf_map); + lsm_set_blob_size(&needed->lbs_bpf_prog, &blob_sizes.lbs_bpf_prog); + lsm_set_blob_size(&needed->lbs_bpf_token, &blob_sizes.lbs_bpf_token); } /* Prepare LSM for initialization. 
*/ @@ -480,6 +483,9 @@ static void __init ordered_lsm_init(void) init_debug("tun device blob size = %d\n", blob_sizes.lbs_tun_dev); init_debug("xattr slots = %d\n", blob_sizes.lbs_xattr_count); init_debug("bdev blob size = %d\n", blob_sizes.lbs_bdev); + init_debug("bpf map blob size = %d\n", blob_sizes.lbs_bpf_map); + init_debug("bpf prog blob size = %d\n", blob_sizes.lbs_bpf_prog); + init_debug("bpf token blob size = %d\n", blob_sizes.lbs_bpf_token); /* * Create any kmem_caches needed for blobs @@ -827,6 +833,47 @@ static int lsm_bdev_alloc(struct block_device *bdev) GFP_KERNEL); } +#ifdef CONFIG_BPF_SYSCALL +/** + * lsm_bpf_map_alloc - allocate a composite bpf_map blob + * @map: the bpf_map that needs a blob + * + * Allocate the bpf_map blob for all the modules + * + * Returns 0, or -ENOMEM if memory can't be allocated. + */ +static int lsm_bpf_map_alloc(struct bpf_map *map) +{ + return lsm_blob_alloc(&map->security, blob_sizes.lbs_bpf_map, GFP_KERNEL); +} + +/** + * lsm_bpf_prog_alloc - allocate a composite bpf_prog blob + * @prog: the bpf_prog that needs a blob + * + * Allocate the bpf_prog blob for all the modules + * + * Returns 0, or -ENOMEM if memory can't be allocated. + */ +static int lsm_bpf_prog_alloc(struct bpf_prog *prog) +{ + return lsm_blob_alloc(&prog->aux->security, blob_sizes.lbs_bpf_prog, GFP_KERNEL); +} + +/** + * lsm_bpf_token_alloc - allocate a composite bpf_token blob + * @token: the bpf_token that needs a blob + * + * Allocate the bpf_token blob for all the modules + * + * Returns 0, or -ENOMEM if memory can't be allocated. + */ +static int lsm_bpf_token_alloc(struct bpf_token *token) +{ + return lsm_blob_alloc(&token->security, blob_sizes.lbs_bpf_token, GFP_KERNEL); +} +#endif /* CONFIG_BPF_SYSCALL */ + /** * lsm_early_task - during initialization allocate a composite task blob * @task: the task that needs a blob @@ -5706,7 +5753,16 @@ int security_bpf_prog(struct bpf_prog *prog) int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, struct bpf_token *token, bool kernel) { - return call_int_hook(bpf_map_create, map, attr, token, kernel); + int rc; + + rc = lsm_bpf_map_alloc(map); + if (unlikely(rc)) + return rc; + + rc = call_int_hook(bpf_map_create, map, attr, token, kernel); + if (unlikely(rc)) + security_bpf_map_free(map); + return rc; } /** @@ -5725,7 +5781,16 @@ int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token, bool kernel) { - return call_int_hook(bpf_prog_load, prog, attr, token, kernel); + int rc; + + rc = lsm_bpf_prog_alloc(prog); + if (unlikely(rc)) + return rc; + + rc = call_int_hook(bpf_prog_load, prog, attr, token, kernel); + if (unlikely(rc)) + security_bpf_prog_free(prog); + return rc; } /** @@ -5742,7 +5807,16 @@ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, const struct path *path) { - return call_int_hook(bpf_token_create, token, attr, path); + int rc; + + rc = lsm_bpf_token_alloc(token); + if (unlikely(rc)) + return rc; + + rc = call_int_hook(bpf_token_create, token, attr, path); + if (unlikely(rc)) + security_bpf_token_free(token); + return rc; } /** @@ -5786,6 +5860,8 @@ int security_bpf_token_capable(const struct bpf_token *token, int cap) void security_bpf_map_free(struct bpf_map *map) { call_void_hook(bpf_map_free, map); + kfree(map->security); + map->security = NULL; } /** @@ -5797,6 +5873,8 @@ void 
security_bpf_map_free(struct bpf_map *map) void security_bpf_prog_free(struct bpf_prog *prog) { call_void_hook(bpf_prog_free, prog); + kfree(prog->aux->security); + prog->aux->security = NULL; } /** @@ -5808,6 +5886,8 @@ void security_bpf_prog_free(struct bpf_prog *prog) void security_bpf_token_free(struct bpf_token *token) { call_void_hook(bpf_token_free, token); + kfree(token->security); + token->security = NULL; } #endif /* CONFIG_BPF_SYSCALL */ diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d..4da5e792b42e 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7062,14 +7062,14 @@ static int bpf_fd_pass(const struct file *file, u32 sid) if (file->f_op == &bpf_map_fops) { map = file->private_data; - bpfsec = map->security; + bpfsec = selinux_bpf_map_security(map); ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, bpf_map_fmode_to_av(file->f_mode), NULL); if (ret) return ret; } else if (file->f_op == &bpf_prog_fops) { prog = file->private_data; - bpfsec = prog->aux->security; + bpfsec = selinux_bpf_prog_security(prog); ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, BPF__PROG_RUN, NULL); if (ret) @@ -7083,7 +7083,7 @@ static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode) u32 sid = current_sid(); struct bpf_security_struct *bpfsec; - bpfsec = map->security; + bpfsec = selinux_bpf_map_security(map); return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, bpf_map_fmode_to_av(fmode), NULL); } @@ -7093,7 +7093,7 @@ static int selinux_bpf_prog(struct bpf_prog *prog) u32 sid = current_sid(); struct bpf_security_struct *bpfsec; - bpfsec = prog->aux->security; + bpfsec = selinux_bpf_prog_security(prog); return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF, BPF__PROG_RUN, NULL); } @@ -7103,69 +7103,33 @@ static int selinux_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, { struct bpf_security_struct *bpfsec; - bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); - if (!bpfsec) - return -ENOMEM; - + bpfsec = selinux_bpf_map_security(map); bpfsec->sid = current_sid(); - map->security = bpfsec; return 0; } -static void selinux_bpf_map_free(struct bpf_map *map) -{ - struct bpf_security_struct *bpfsec = map->security; - - map->security = NULL; - kfree(bpfsec); -} - static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token, bool kernel) { struct bpf_security_struct *bpfsec; - bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); - if (!bpfsec) - return -ENOMEM; - + bpfsec = selinux_bpf_prog_security(prog); bpfsec->sid = current_sid(); - prog->aux->security = bpfsec; return 0; } -static void selinux_bpf_prog_free(struct bpf_prog *prog) -{ - struct bpf_security_struct *bpfsec = prog->aux->security; - - prog->aux->security = NULL; - kfree(bpfsec); -} - static int selinux_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, const struct path *path) { struct bpf_security_struct *bpfsec; - bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); - if (!bpfsec) - return -ENOMEM; - + bpfsec = selinux_bpf_token_security(token); bpfsec->sid = current_sid(); - token->security = bpfsec; return 0; } - -static void selinux_bpf_token_free(struct bpf_token *token) -{ - struct bpf_security_struct *bpfsec = token->security; - - token->security = NULL; - kfree(bpfsec); -} #endif struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = { @@ -7183,6 +7147,9 @@ struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = { .lbs_xattr_count = SELINUX_INODE_INIT_XATTRS, .lbs_tun_dev = sizeof(struct tun_security_struct), .lbs_ib 
= sizeof(struct ib_security_struct), + .lbs_bpf_map = sizeof(struct bpf_security_struct), + .lbs_bpf_prog = sizeof(struct bpf_security_struct), + .lbs_bpf_token = sizeof(struct bpf_security_struct), }; #ifdef CONFIG_PERF_EVENTS @@ -7536,9 +7503,6 @@ static struct security_hook_list selinux_hooks[] __ro_after_init = { LSM_HOOK_INIT(bpf, selinux_bpf), LSM_HOOK_INIT(bpf_map, selinux_bpf_map), LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog), - LSM_HOOK_INIT(bpf_map_free, selinux_bpf_map_free), - LSM_HOOK_INIT(bpf_prog_free, selinux_bpf_prog_free), - LSM_HOOK_INIT(bpf_token_free, selinux_bpf_token_free), #endif #ifdef CONFIG_PERF_EVENTS diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 1d7ac59015a1..2d5139c6d45b 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -26,6 +26,7 @@ #include #include #include +#include #include "flask.h" #include "avc.h" @@ -245,4 +246,23 @@ selinux_perf_event(void *perf_event) return perf_event + selinux_blob_sizes.lbs_perf_event; } +#ifdef CONFIG_BPF_SYSCALL +static inline struct bpf_security_struct * +selinux_bpf_map_security(struct bpf_map *map) +{ + return map->security + selinux_blob_sizes.lbs_bpf_map; +} + +static inline struct bpf_security_struct * +selinux_bpf_prog_security(struct bpf_prog *prog) +{ + return prog->aux->security + selinux_blob_sizes.lbs_bpf_prog; +} + +static inline struct bpf_security_struct * +selinux_bpf_token_security(struct bpf_token *token) +{ + return token->security + selinux_blob_sizes.lbs_bpf_token; +} +#endif /* CONFIG_BPF_SYSCALL */ #endif /* _SELINUX_OBJSEC_H_ */ -- cgit v1.2.3 From bea90085dcb0f9a75748e73d723bde557a5ebf1a Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 23 Jul 2025 11:21:53 -0400 Subject: dlm: use defines for force values in dlm_release_lockspace Clarify the use of the force parameter by renaming it to "release_option" and adding defines (with descriptions) for each of the accepted values. 
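As a minimal sketch of the intended usage (mirroring the md-cluster hunk below, not an additional change in this patch), a caller that used to pass a bare magic number:

	dlm_release_lockspace(cinfo->lockspace, 2);

now states what that value means:

	dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL);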
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- drivers/md/md-cluster.c | 4 ++-- fs/dlm/lockspace.c | 20 +++++++++----------- fs/dlm/user.c | 6 +++--- fs/gfs2/lock_dlm.c | 4 ++-- fs/ocfs2/stack_user.c | 2 +- include/linux/dlm.h | 26 +++++++++++++++++++++++++- 6 files changed, 42 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 5497eaee96e7..6e9a0045f0ff 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -979,7 +979,7 @@ err: lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); if (cinfo->lockspace) - dlm_release_lockspace(cinfo->lockspace, 2); + dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL); mddev->cluster_info = NULL; kfree(cinfo); return ret; @@ -1042,7 +1042,7 @@ static int leave(struct mddev *mddev) lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); unlock_all_bitmaps(mddev); - dlm_release_lockspace(cinfo->lockspace, 2); + dlm_release_lockspace(cinfo->lockspace, DLM_RELEASE_NORMAL); kfree(cinfo); return 0; } diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index ee11a70def92..be8dbf482229 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -671,19 +671,20 @@ int dlm_new_user_lockspace(const char *name, const char *cluster, This is because there may be LKBs queued as ASTs that have been unlinked from their RSBs and are pending deletion once the AST has been delivered */ -static int lockspace_busy(struct dlm_ls *ls, int force) +static int lockspace_busy(struct dlm_ls *ls, int release_option) { struct dlm_lkb *lkb; unsigned long id; int rv = 0; read_lock_bh(&ls->ls_lkbxa_lock); - if (force == 0) { + if (release_option == DLM_RELEASE_NO_LOCKS) { xa_for_each(&ls->ls_lkbxa, id, lkb) { rv = 1; break; } - } else if (force == 1) { + } else if (release_option == DLM_RELEASE_UNUSED) { + /* TODO: handle this UNUSED option as NO_LOCKS in later patch */ xa_for_each(&ls->ls_lkbxa, id, lkb) { if (lkb->lkb_nodeid == 0 && lkb->lkb_grmode != DLM_LOCK_IV) { @@ -698,11 +699,11 @@ static int lockspace_busy(struct dlm_ls *ls, int force) return rv; } -static int release_lockspace(struct dlm_ls *ls, int force) +static int release_lockspace(struct dlm_ls *ls, int release_option) { int busy, rv; - busy = lockspace_busy(ls, force); + busy = lockspace_busy(ls, release_option); spin_lock_bh(&lslist_lock); if (ls->ls_create_count == 1) { @@ -730,7 +731,8 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_device_deregister(ls); - if (force != 3 && dlm_user_daemon_available()) + if (release_option != DLM_RELEASE_NO_EVENT && + dlm_user_daemon_available()) do_uevent(ls, 0); dlm_recoverd_stop(ls); @@ -782,11 +784,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) * lockspace must continue to function as usual, participating in recoveries, * until this returns. * - * Force has 4 possible values: - * 0 - don't destroy lockspace if it has any LKBs - * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs - * 2 - destroy lockspace regardless of LKBs - * 3 - destroy lockspace as part of a forced shutdown + * See DLM_RELEASE defines for release_option values and their meaning. 
*/ int dlm_release_lockspace(void *lockspace, int force) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 5cb3896be826..51daf4acbe31 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -425,7 +425,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params) dlm_put_lockspace(ls); if (error) - dlm_release_lockspace(lockspace, 0); + dlm_release_lockspace(lockspace, DLM_RELEASE_NO_LOCKS); else error = ls->ls_device.minor; @@ -436,7 +436,7 @@ static int device_remove_lockspace(struct dlm_lspace_params *params) { dlm_lockspace_t *lockspace; struct dlm_ls *ls; - int error, force = 0; + int error, force = DLM_RELEASE_NO_LOCKS; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -446,7 +446,7 @@ static int device_remove_lockspace(struct dlm_lspace_params *params) return -ENOENT; if (params->flags & DLM_USER_LSFLG_FORCEFREE) - force = 2; + force = DLM_RELEASE_NORMAL; lockspace = ls; dlm_put_lockspace(ls); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index cee5d199d2d8..aac4dd6d0381 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -1400,7 +1400,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) return 0; fail_release: - dlm_release_lockspace(ls->ls_dlm, 2); + dlm_release_lockspace(ls->ls_dlm, DLM_RELEASE_NORMAL); fail_free: free_recover_size(ls); fail: @@ -1437,7 +1437,7 @@ static void gdlm_unmount(struct gfs2_sbd *sdp) /* mounted_lock and control_lock will be purged in dlm recovery */ release: if (ls->ls_dlm) { - dlm_release_lockspace(ls->ls_dlm, 2); + dlm_release_lockspace(ls->ls_dlm, DLM_RELEASE_NORMAL); ls->ls_dlm = NULL; } diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 0f045e45fa0c..765105f1ff8a 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -952,7 +952,7 @@ static const struct dlm_lockspace_ops ocfs2_ls_ops = { static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn) { version_unlock(conn); - dlm_release_lockspace(conn->cc_lockspace, 2); + dlm_release_lockspace(conn->cc_lockspace, DLM_RELEASE_NORMAL); conn->cc_lockspace = NULL; ocfs2_live_connection_drop(conn->cc_private); conn->cc_private = NULL; diff --git a/include/linux/dlm.h b/include/linux/dlm.h index bacda9898f2b..cc7a36244893 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -87,13 +87,37 @@ int dlm_new_lockspace(const char *name, const char *cluster, const struct dlm_lockspace_ops *ops, void *ops_arg, int *ops_result, dlm_lockspace_t **lockspace); +/* + * dlm_release_lockspace() release_option values: + * + * DLM_RELEASE_NO_LOCKS returns -EBUSY if any locks (lkb's) + * exist in the local lockspace. + * + * DLM_RELEASE_UNUSED previous value that is no longer used. + * + * DLM_RELEASE_NORMAL releases the lockspace regardless of any + * locks managed in the local lockspace. + * + * DLM_RELEASE_NO_EVENT release the lockspace regardless of any + * locks managed in the local lockspace, and does not submit + * a leave event to the cluster manager, so other nodes will + * not be notified that the node should be removed from the + * list of lockspace members. + */ +#define DLM_RELEASE_NO_LOCKS 0 +#define DLM_RELEASE_UNUSED 1 +#define DLM_RELEASE_NORMAL 2 +#define DLM_RELEASE_NO_EVENT 3 + /* * dlm_release_lockspace * * Stop a lockspace. + * + * release_option: see DLM_RELEASE values above. 
*/ -int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force); +int dlm_release_lockspace(dlm_lockspace_t *lockspace, int release_option); /* * dlm_lock -- cgit v1.2.3 From 6f4f4ca5caf73de5e86329547d4527b3e0c08488 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 23 Jul 2025 11:21:56 -0400 Subject: dlm: add new flag DLM_RELEASE_RECOVER for dlm_release_lockspace When dlm_release_lockspace() is passed DLM_RELEASE_RECOVER, it tells the dlm to handle the release/leave as if the node had failed, i.e. perform recovery steps for a failed node, like recover_slot(). When DLM_RELEASE_RECOVER is set: - dlm_release_lockspace() includes RELEASE_RECOVER=1 in the OFFLINE uevent sent to userspace. - userspace/dlm_controld sends a message to all lockspace members indicating that the subsequent node removal should be handled as if the node had failed. - when dlm_controld on all nodes receives the new message, it sets the release_recover configfs entry to 1 for the node. - when the dlm/kernel next performs recovery and removes the node, it will see that release_recover has been set, and will perform recovery steps for the node as if it had failed, e.g. the recover_slot() callback is called to notify the fs. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 2 +- fs/dlm/member.c | 14 ++++++++++---- include/linux/dlm.h | 5 +++++ 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 6ff666a511c7..d986b7ef153d 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -738,7 +738,7 @@ static int release_lockspace(struct dlm_ls *ls, int release_option) if (release_option != DLM_RELEASE_NO_EVENT && dlm_user_daemon_available()) - do_uevent(ls, 0, 0); + do_uevent(ls, 0, (release_option == DLM_RELEASE_RECOVER)); dlm_recoverd_stop(ls); diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 152d2cb16f59..356337102015 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -478,7 +478,8 @@ static void dlm_lsop_recover_prep(struct dlm_ls *ls) ls->ls_ops->recover_prep(ls->ls_ops_arg); } -static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) +static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb, + unsigned int release_recover) { struct dlm_slot slot; uint32_t seq; @@ -495,7 +496,7 @@ static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb) error = dlm_comm_seq(memb->nodeid, &seq, false); - if (!error && seq == memb->comm_seq) + if (!release_recover && !error && seq == memb->comm_seq) return; slot.nodeid = memb->nodeid; @@ -552,6 +553,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) struct dlm_member *memb, *safe; struct dlm_config_node *node; int i, error, neg = 0, low = -1; + unsigned int release_recover; /* previously removed members that we've not finished removing need to * count as a negative change so the "neg" recovery steps will happen @@ -572,8 +574,12 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) if (node && !node->new && !node->gone) continue; + release_recover = 0; + if (node->gone) { - log_rinfo(ls, "remove member %d", memb->nodeid); + release_recover = node->release_recover; + log_rinfo(ls, "remove member %d%s", memb->nodeid, + release_recover ?
" (release_recover)" : ""); } else { /* removed and re-added */ log_rinfo(ls, "remove member %d comm_seq %u %u", @@ -584,7 +590,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) list_move(&memb->list, &ls->ls_nodes_gone); remove_remote_member(memb->nodeid); ls->ls_num_nodes--; - dlm_lsop_recover_slot(ls, memb); + dlm_lsop_recover_slot(ls, memb, release_recover); } /* add new members to ls_nodes */ diff --git a/include/linux/dlm.h b/include/linux/dlm.h index cc7a36244893..108eb953eb18 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -103,11 +103,16 @@ int dlm_new_lockspace(const char *name, const char *cluster, * a leave event to the cluster manager, so other nodes will * not be notified that the node should be removed from the * list of lockspace members. + * + * DLM_RELEASE_RECOVER like DLM_RELEASE_NORMAL, but the remaining + * nodes will handle the removal of the node as if the node + * had failed, e.g. the recover_slot() callback would be used. */ #define DLM_RELEASE_NO_LOCKS 0 #define DLM_RELEASE_UNUSED 1 #define DLM_RELEASE_NORMAL 2 #define DLM_RELEASE_NO_EVENT 3 +#define DLM_RELEASE_RECOVER 4 /* * dlm_release_lockspace -- cgit v1.2.3 From 534b9bdeb4b80d843ca9f924524d4d103ad6605e Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 29 Jun 2025 22:52:45 -0700 Subject: Input: tca6416-keypad - remove the driver This input driver predates proper GPIO driver for the chip and now can be replaced with the generic gpio-keys. Remove the driver. Link: https://lore.kernel.org/r/ajfsei3keh4jjasd4lshjicgqixew7bak3cmty3suoliskzgz4@vj3ijycfxy4i Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/Kconfig | 18 -- drivers/input/keyboard/Makefile | 1 - drivers/input/keyboard/tca6416-keypad.c | 305 -------------------------------- include/linux/tca6416_keypad.h | 30 ---- 4 files changed, 354 deletions(-) delete mode 100644 drivers/input/keyboard/tca6416-keypad.c delete mode 100644 include/linux/tca6416_keypad.h (limited to 'include/linux') diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index 7c4f309a4cb6..f3474026d192 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -262,24 +262,6 @@ config KEYBOARD_GPIO_POLLED To compile this driver as a module, choose M here: the module will be called gpio_keys_polled. -config KEYBOARD_TCA6416 - tristate "TCA6416/TCA6408A Keypad Support" - depends on I2C - help - This driver implements basic keypad functionality - for keys connected through TCA6416/TCA6408A IO expanders. - - Say Y here if your device has keys connected to - TCA6416/TCA6408A IO expander. Your board-specific setup logic - must also provide pin-mask details(of which TCA6416 pins - are used for keypad). - - If enabled the entire TCA6416 device will be managed through - this driver. - - To compile this driver as a module, choose M here: the - module will be called tca6416_keypad. 
- config KEYBOARD_TCA8418 tristate "TCA8418 Keypad Support" depends on I2C diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile index 8bc20ab2b103..c0134da14580 100644 --- a/drivers/input/keyboard/Makefile +++ b/drivers/input/keyboard/Makefile @@ -23,7 +23,6 @@ obj-$(CONFIG_KEYBOARD_EP93XX) += ep93xx_keypad.o obj-$(CONFIG_KEYBOARD_GOLDFISH_EVENTS) += goldfish_events.o obj-$(CONFIG_KEYBOARD_GPIO) += gpio_keys.o obj-$(CONFIG_KEYBOARD_GPIO_POLLED) += gpio_keys_polled.o -obj-$(CONFIG_KEYBOARD_TCA6416) += tca6416-keypad.o obj-$(CONFIG_KEYBOARD_TCA8418) += tca8418_keypad.o obj-$(CONFIG_KEYBOARD_HIL) += hil_kbd.o obj-$(CONFIG_KEYBOARD_HIL_OLD) += hilkbd.o diff --git a/drivers/input/keyboard/tca6416-keypad.c b/drivers/input/keyboard/tca6416-keypad.c deleted file mode 100644 index fbc674d7b9f0..000000000000 --- a/drivers/input/keyboard/tca6416-keypad.c +++ /dev/null @@ -1,305 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Driver for keys on TCA6416 I2C IO expander - * - * Copyright (C) 2010 Texas Instruments - * - * Author : Sriramakrishnan.A.G. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define TCA6416_INPUT 0 -#define TCA6416_OUTPUT 1 -#define TCA6416_INVERT 2 -#define TCA6416_DIRECTION 3 - -#define TCA6416_POLL_INTERVAL 100 /* msec */ - -static const struct i2c_device_id tca6416_id[] = { - { "tca6416-keys", 16, }, - { "tca6408-keys", 8, }, - { } -}; -MODULE_DEVICE_TABLE(i2c, tca6416_id); - -struct tca6416_keypad_chip { - uint16_t reg_output; - uint16_t reg_direction; - uint16_t reg_input; - - struct i2c_client *client; - struct input_dev *input; - int io_size; - u16 pinmask; - bool use_polling; - struct tca6416_button buttons[]; -}; - -static int tca6416_write_reg(struct tca6416_keypad_chip *chip, int reg, u16 val) -{ - int error; - - error = chip->io_size > 8 ? - i2c_smbus_write_word_data(chip->client, reg << 1, val) : - i2c_smbus_write_byte_data(chip->client, reg, val); - if (error < 0) { - dev_err(&chip->client->dev, - "%s failed, reg: %d, val: %d, error: %d\n", - __func__, reg, val, error); - return error; - } - - return 0; -} - -static int tca6416_read_reg(struct tca6416_keypad_chip *chip, int reg, u16 *val) -{ - int retval; - - retval = chip->io_size > 8 ? - i2c_smbus_read_word_data(chip->client, reg << 1) : - i2c_smbus_read_byte_data(chip->client, reg); - if (retval < 0) { - dev_err(&chip->client->dev, "%s failed, reg: %d, error: %d\n", - __func__, reg, retval); - return retval; - } - - *val = (u16)retval; - return 0; -} - -static void tca6416_keys_scan(struct input_dev *input) -{ - struct tca6416_keypad_chip *chip = input_get_drvdata(input); - u16 reg_val, val; - int error, i, pin_index; - - error = tca6416_read_reg(chip, TCA6416_INPUT, ®_val); - if (error) - return; - - reg_val &= chip->pinmask; - - /* Figure out which lines have changed */ - val = reg_val ^ chip->reg_input; - chip->reg_input = reg_val; - - for (i = 0, pin_index = 0; i < 16; i++) { - if (val & (1 << i)) { - struct tca6416_button *button = &chip->buttons[pin_index]; - unsigned int type = button->type ?: EV_KEY; - int state = ((reg_val & (1 << i)) ? 1 : 0) - ^ button->active_low; - - input_event(input, type, button->code, !!state); - input_sync(input); - } - - if (chip->pinmask & (1 << i)) - pin_index++; - } -} - -/* - * This is threaded IRQ handler and this can (and will) sleep. 
- */ -static irqreturn_t tca6416_keys_isr(int irq, void *dev_id) -{ - tca6416_keys_scan(dev_id); - - return IRQ_HANDLED; -} - -static int tca6416_keys_open(struct input_dev *dev) -{ - struct tca6416_keypad_chip *chip = input_get_drvdata(dev); - - if (!chip->use_polling) { - /* Get initial device state in case it has switches */ - tca6416_keys_scan(dev); - enable_irq(chip->client->irq); - } - - return 0; -} - -static void tca6416_keys_close(struct input_dev *dev) -{ - struct tca6416_keypad_chip *chip = input_get_drvdata(dev); - - if (!chip->use_polling) - disable_irq(chip->client->irq); -} - -static int tca6416_setup_registers(struct tca6416_keypad_chip *chip) -{ - int error; - - error = tca6416_read_reg(chip, TCA6416_OUTPUT, &chip->reg_output); - if (error) - return error; - - error = tca6416_read_reg(chip, TCA6416_DIRECTION, &chip->reg_direction); - if (error) - return error; - - /* ensure that keypad pins are set to input */ - error = tca6416_write_reg(chip, TCA6416_DIRECTION, - chip->reg_direction | chip->pinmask); - if (error) - return error; - - error = tca6416_read_reg(chip, TCA6416_DIRECTION, &chip->reg_direction); - if (error) - return error; - - error = tca6416_read_reg(chip, TCA6416_INPUT, &chip->reg_input); - if (error) - return error; - - chip->reg_input &= chip->pinmask; - - return 0; -} - -static int tca6416_keypad_probe(struct i2c_client *client) -{ - const struct i2c_device_id *id = i2c_client_get_device_id(client); - struct tca6416_keys_platform_data *pdata; - struct tca6416_keypad_chip *chip; - struct input_dev *input; - int error; - int i; - - /* Check functionality */ - if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_BYTE)) { - dev_err(&client->dev, "%s adapter not supported\n", - dev_driver_string(&client->adapter->dev)); - return -ENODEV; - } - - pdata = dev_get_platdata(&client->dev); - if (!pdata) { - dev_dbg(&client->dev, "no platform data\n"); - return -EINVAL; - } - - chip = devm_kzalloc(&client->dev, - struct_size(chip, buttons, pdata->nbuttons), - GFP_KERNEL); - if (!chip) - return -ENOMEM; - - input = devm_input_allocate_device(&client->dev); - if (!input) - return -ENOMEM; - - chip->client = client; - chip->input = input; - chip->io_size = id->driver_data; - chip->pinmask = pdata->pinmask; - chip->use_polling = pdata->use_polling; - - input->phys = "tca6416-keys/input0"; - input->name = client->name; - - input->open = tca6416_keys_open; - input->close = tca6416_keys_close; - - input->id.bustype = BUS_HOST; - input->id.vendor = 0x0001; - input->id.product = 0x0001; - input->id.version = 0x0100; - - /* Enable auto repeat feature of Linux input subsystem */ - if (pdata->rep) - __set_bit(EV_REP, input->evbit); - - for (i = 0; i < pdata->nbuttons; i++) { - unsigned int type; - - chip->buttons[i] = pdata->buttons[i]; - type = (pdata->buttons[i].type) ?: EV_KEY; - input_set_capability(input, type, pdata->buttons[i].code); - } - - input_set_drvdata(input, chip); - - /* - * Initialize cached registers from their original values. - * we can't share this chip with another i2c master. 
- */ - error = tca6416_setup_registers(chip); - if (error) - return error; - - if (chip->use_polling) { - error = input_setup_polling(input, tca6416_keys_scan); - if (error) { - dev_err(&client->dev, "Failed to setup polling\n"); - return error; - } - - input_set_poll_interval(input, TCA6416_POLL_INTERVAL); - } else { - error = devm_request_threaded_irq(&client->dev, client->irq, - NULL, tca6416_keys_isr, - IRQF_TRIGGER_FALLING | - IRQF_ONESHOT | - IRQF_NO_AUTOEN, - "tca6416-keypad", input); - if (error) { - dev_dbg(&client->dev, - "Unable to claim irq %d; error %d\n", - client->irq, error); - return error; - } - } - - error = input_register_device(input); - if (error) { - dev_dbg(&client->dev, - "Unable to register input device, error: %d\n", error); - return error; - } - - i2c_set_clientdata(client, chip); - - return 0; -} - -static struct i2c_driver tca6416_keypad_driver = { - .driver = { - .name = "tca6416-keypad", - }, - .probe = tca6416_keypad_probe, - .id_table = tca6416_id, -}; - -static int __init tca6416_keypad_init(void) -{ - return i2c_add_driver(&tca6416_keypad_driver); -} - -subsys_initcall(tca6416_keypad_init); - -static void __exit tca6416_keypad_exit(void) -{ - i2c_del_driver(&tca6416_keypad_driver); -} -module_exit(tca6416_keypad_exit); - -MODULE_AUTHOR("Sriramakrishnan "); -MODULE_DESCRIPTION("Keypad driver over tca6416 IO expander"); -MODULE_LICENSE("GPL"); diff --git a/include/linux/tca6416_keypad.h b/include/linux/tca6416_keypad.h deleted file mode 100644 index 5cf6f6f82aa7..000000000000 --- a/include/linux/tca6416_keypad.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tca6416 keypad platform support - * - * Copyright (C) 2010 Texas Instruments - * - * Author: Sriramakrishnan - */ - -#ifndef _TCA6416_KEYS_H -#define _TCA6416_KEYS_H - -#include - -struct tca6416_button { - /* Configuration parameters */ - int code; /* input event code (KEY_*, SW_*) */ - int active_low; - int type; /* input event type (EV_KEY, EV_SW) */ -}; - -struct tca6416_keys_platform_data { - struct tca6416_button *buttons; - int nbuttons; - unsigned int rep:1; /* enable input subsystem auto repeat */ - uint16_t pinmask; - uint16_t invert; - int use_polling; /* use polling if Interrupt is not connected*/ -}; -#endif -- cgit v1.2.3 From c93c59baa5ab57e94b874000cec56e26611b7a23 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Mon, 11 Aug 2025 20:58:20 +0200 Subject: bpf: Tidy verifier bug message Yonghong noticed that error messages for potential verifier bugs often have a '(1)' at the end. This is happening because verifier_bug_if(cond, env, fmt, args...) prints "(" #cond ")\n" as part of the message and verifier_bug() is defined as: #define verifier_bug(env, fmt, args...) verifier_bug_if(1, env, fmt, ##args) Hence, verifier_bug() always ends up displaying '(1)'. This small patch fixes it by having verifier_bug_if conditionally call verifier_bug instead of the other way around. 
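To illustrate the effect (the message text here is hypothetical; the macros are as defined below): before this patch,

	verifier_bug(env, "unexpected spill");

expanded through verifier_bug_if(1, ...) and printed

	verifier bug: unexpected spill(1)

With this patch the same call prints "verifier bug: unexpected spill", while a real condition such as verifier_bug_if(insn_idx < 0, env, "bad insn") still appends the condition text and prints "verifier bug: bad insn (insn_idx < 0)".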
Fixes: 1cb0f56d9618 ("bpf: WARN_ONCE on verifier bugs") Reported-by: Yonghong Song Signed-off-by: Paul Chaignon Signed-off-by: Andrii Nakryiko Tested-by: Eduard Zingerman Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/aJo9THBrzo8jFXsh@mail.gmail.com --- include/linux/bpf_verifier.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c823f8efe3ed..020de62bd09c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -875,13 +875,15 @@ __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, #define verifier_bug_if(cond, env, fmt, args...) \ ({ \ bool __cond = (cond); \ - if (unlikely(__cond)) { \ - BPF_WARN_ONCE(1, "verifier bug: " fmt "(" #cond ")\n", ##args); \ - bpf_log(&env->log, "verifier bug: " fmt "(" #cond ")\n", ##args); \ - } \ + if (unlikely(__cond)) \ + verifier_bug(env, fmt " (" #cond ")", ##args); \ (__cond); \ }) -#define verifier_bug(env, fmt, args...) verifier_bug_if(1, env, fmt, ##args) +#define verifier_bug(env, fmt, args...) \ + ({ \ + BPF_WARN_ONCE(1, "verifier bug: " fmt "\n", ##args); \ + bpf_log(&env->log, "verifier bug: " fmt "\n", ##args); \ + }) static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { -- cgit v1.2.3 From 07bbbfe7addf5b032e04f3c38f0b183d067a3f0d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 11 Aug 2025 19:50:43 +0100 Subject: net: stmmac: add suspend()/resume() platform ops Add suspend/resume platform operations, which, when populated, override the init/exit platform operations when we suspend and resume. These suspend()/resume() methods are called by core code, and thus are designed to support any struct device, not just platform devices. This allows them to be used by the PCI drivers we have. 
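As a sketch of how glue code might populate the new hooks (my_bsp_suspend/my_bsp_resume and the plat pointer are illustrative names, not part of this patch):

	static int my_bsp_suspend(struct device *dev, void *bsp_priv)
	{
		/* quiesce board-specific clocks/serdes/regulators */
		return 0;
	}

	static int my_bsp_resume(struct device *dev, void *bsp_priv)
	{
		/* undo whatever my_bsp_suspend() shut down */
		return 0;
	}

	...
	plat->suspend = my_bsp_suspend;
	plat->resume = my_bsp_resume;

Because the callbacks take a struct device rather than a struct platform_device, the same pair can also be wired up by PCI glue drivers.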
Signed-off-by: Russell King (Oracle) Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/E1ulXbX-008gqZ-Bb@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 9 +++++++++ drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 12 ++++++++---- include/linux/stmmac.h | 2 ++ 3 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index f1abf4242cd2..2da4f7bb2899 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7879,6 +7879,9 @@ int stmmac_suspend(struct device *dev) if (stmmac_fpe_supported(priv)) ethtool_mmsv_stop(&priv->fpe_cfg.mmsv); + if (priv->plat->suspend) + return priv->plat->suspend(dev, priv->plat->bsp_priv); + return 0; } EXPORT_SYMBOL_GPL(stmmac_suspend); @@ -7931,6 +7934,12 @@ int stmmac_resume(struct device *dev) struct stmmac_priv *priv = netdev_priv(ndev); int ret; + if (priv->plat->resume) { + ret = priv->plat->resume(dev, priv->plat->bsp_priv); + if (ret) + return ret; + } + if (!netif_running(ndev)) return 0; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 030fcf1b5993..21df052eeed0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -901,7 +901,9 @@ static int __maybe_unused stmmac_pltfr_suspend(struct device *dev) struct platform_device *pdev = to_platform_device(dev); ret = stmmac_suspend(dev); - stmmac_pltfr_exit(pdev, priv->plat); + + if (!priv->plat->suspend) + stmmac_pltfr_exit(pdev, priv->plat); return ret; } @@ -920,9 +922,11 @@ static int __maybe_unused stmmac_pltfr_resume(struct device *dev) struct platform_device *pdev = to_platform_device(dev); int ret; - ret = stmmac_pltfr_init(pdev, priv->plat); - if (ret) - return ret; + if (!priv->plat->resume) { + ret = stmmac_pltfr_init(pdev, priv->plat); + if (ret) + return ret; + } return stmmac_resume(dev); } diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 26ddf95d23f9..22c24dacbc65 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -248,6 +248,8 @@ struct plat_stmmacenet_data { void (*ptp_clk_freq_config)(struct stmmac_priv *priv); int (*init)(struct platform_device *pdev, void *priv); void (*exit)(struct platform_device *pdev, void *priv); + int (*suspend)(struct device *dev, void *priv); + int (*resume)(struct device *dev, void *priv); struct mac_device_info *(*setup)(void *priv); int (*clks_config)(void *priv, bool enabled); int (*crosststamp)(ktime_t *device, struct system_counterval_t *system, -- cgit v1.2.3 From b3ef7bdec66fb1813e865fd39d179a93cefd2015 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 11 Aug 2025 17:31:42 +0200 Subject: net: airoha: Add airoha_offload.h header Move NPU definitions to airoha_offload.h in include/linux/soc/airoha/ in order to allow the MT76 driver to access the callback definitions. 
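For illustration, a consumer such as the MT76 driver would go through the new static inline accessors instead of dereferencing the ops table directly (sketch only, error handling trimmed):

	struct airoha_npu *npu;
	dma_addr_t stats_addr;

	npu = airoha_npu_get(dev, &stats_addr);
	if (!IS_ERR_OR_NULL(npu)) {
		airoha_npu_wlan_init_reserved_memory(npu);
		airoha_npu_wlan_enable_irq(npu, 0);
		/* ... program queues via airoha_npu_wlan_get_queue_addr() ... */
		airoha_npu_put(npu);
	}

The stubbed variants in the !CONFIG_NET_AIROHA_NPU branch of the new header let such a caller compile unchanged when the NPU driver is not built.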
Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250811-airoha-en7581-wlan-offlaod-v7-7-58823603bb4e@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_npu.c | 2 +- drivers/net/ethernet/airoha/airoha_npu.h | 103 ------------ drivers/net/ethernet/airoha/airoha_ppe.c | 2 +- include/linux/soc/airoha/airoha_offload.h | 260 ++++++++++++++++++++++++++++++ 4 files changed, 262 insertions(+), 105 deletions(-) delete mode 100644 drivers/net/ethernet/airoha/airoha_npu.h create mode 100644 include/linux/soc/airoha/airoha_offload.h (limited to 'include/linux') diff --git a/drivers/net/ethernet/airoha/airoha_npu.c b/drivers/net/ethernet/airoha/airoha_npu.c index 66a8a992dbf2..1a6b191ae0b0 100644 --- a/drivers/net/ethernet/airoha/airoha_npu.c +++ b/drivers/net/ethernet/airoha/airoha_npu.c @@ -11,9 +11,9 @@ #include #include #include +#include #include "airoha_eth.h" -#include "airoha_npu.h" #define NPU_EN7581_FIRMWARE_DATA "airoha/en7581_npu_data.bin" #define NPU_EN7581_FIRMWARE_RV32 "airoha/en7581_npu_rv32.bin" diff --git a/drivers/net/ethernet/airoha/airoha_npu.h b/drivers/net/ethernet/airoha/airoha_npu.h deleted file mode 100644 index a448c74208a9..000000000000 --- a/drivers/net/ethernet/airoha/airoha_npu.h +++ /dev/null @@ -1,103 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (c) 2025 AIROHA Inc - * Author: Lorenzo Bianconi - */ - -#define NPU_NUM_CORES 8 -#define NPU_NUM_IRQ 6 - -enum airoha_npu_wlan_set_cmd { - WLAN_FUNC_SET_WAIT_PCIE_ADDR, - WLAN_FUNC_SET_WAIT_DESC, - WLAN_FUNC_SET_WAIT_NPU_INIT_DONE, - WLAN_FUNC_SET_WAIT_TRAN_TO_CPU, - WLAN_FUNC_SET_WAIT_BA_WIN_SIZE, - WLAN_FUNC_SET_WAIT_DRIVER_MODEL, - WLAN_FUNC_SET_WAIT_DEL_STA, - WLAN_FUNC_SET_WAIT_DRAM_BA_NODE_ADDR, - WLAN_FUNC_SET_WAIT_PKT_BUF_ADDR, - WLAN_FUNC_SET_WAIT_IS_TEST_NOBA, - WLAN_FUNC_SET_WAIT_FLUSHONE_TIMEOUT, - WLAN_FUNC_SET_WAIT_FLUSHALL_TIMEOUT, - WLAN_FUNC_SET_WAIT_IS_FORCE_TO_CPU, - WLAN_FUNC_SET_WAIT_PCIE_STATE, - WLAN_FUNC_SET_WAIT_PCIE_PORT_TYPE, - WLAN_FUNC_SET_WAIT_ERROR_RETRY_TIMES, - WLAN_FUNC_SET_WAIT_BAR_INFO, - WLAN_FUNC_SET_WAIT_FAST_FLAG, - WLAN_FUNC_SET_WAIT_NPU_BAND0_ONCPU, - WLAN_FUNC_SET_WAIT_TX_RING_PCIE_ADDR, - WLAN_FUNC_SET_WAIT_TX_DESC_HW_BASE, - WLAN_FUNC_SET_WAIT_TX_BUF_SPACE_HW_BASE, - WLAN_FUNC_SET_WAIT_RX_RING_FOR_TXDONE_HW_BASE, - WLAN_FUNC_SET_WAIT_TX_PKT_BUF_ADDR, - WLAN_FUNC_SET_WAIT_INODE_TXRX_REG_ADDR, - WLAN_FUNC_SET_WAIT_INODE_DEBUG_FLAG, - WLAN_FUNC_SET_WAIT_INODE_HW_CFG_INFO, - WLAN_FUNC_SET_WAIT_INODE_STOP_ACTION, - WLAN_FUNC_SET_WAIT_INODE_PCIE_SWAP, - WLAN_FUNC_SET_WAIT_RATELIMIT_CTRL, - WLAN_FUNC_SET_WAIT_HWNAT_INIT, - WLAN_FUNC_SET_WAIT_ARHT_CHIP_INFO, - WLAN_FUNC_SET_WAIT_TX_BUF_CHECK_ADDR, - WLAN_FUNC_SET_WAIT_TOKEN_ID_SIZE, -}; - -enum airoha_npu_wlan_get_cmd { - WLAN_FUNC_GET_WAIT_NPU_INFO, - WLAN_FUNC_GET_WAIT_LAST_RATE, - WLAN_FUNC_GET_WAIT_COUNTER, - WLAN_FUNC_GET_WAIT_DBG_COUNTER, - WLAN_FUNC_GET_WAIT_RXDESC_BASE, - WLAN_FUNC_GET_WAIT_WCID_DBG_COUNTER, - WLAN_FUNC_GET_WAIT_DMA_ADDR, - WLAN_FUNC_GET_WAIT_RING_SIZE, - WLAN_FUNC_GET_WAIT_NPU_SUPPORT_MAP, - WLAN_FUNC_GET_WAIT_MDC_LOCK_ADDRESS, - WLAN_FUNC_GET_WAIT_NPU_VERSION, -}; - -struct airoha_npu { - struct device *dev; - struct regmap *regmap; - - struct airoha_npu_core { - struct airoha_npu *npu; - /* protect concurrent npu memory accesses */ - spinlock_t lock; - struct work_struct wdt_work; - } cores[NPU_NUM_CORES]; - - int irqs[NPU_NUM_IRQ]; - - struct airoha_foe_stats __iomem *stats; - - struct { - int (*ppe_init)(struct airoha_npu *npu); - int 
(*ppe_deinit)(struct airoha_npu *npu); - int (*ppe_flush_sram_entries)(struct airoha_npu *npu, - dma_addr_t foe_addr, - int sram_num_entries); - int (*ppe_foe_commit_entry)(struct airoha_npu *npu, - dma_addr_t foe_addr, - u32 entry_size, u32 hash, - bool ppe2); - int (*wlan_init_reserved_memory)(struct airoha_npu *npu); - int (*wlan_send_msg)(struct airoha_npu *npu, int ifindex, - enum airoha_npu_wlan_set_cmd func_id, - void *data, int data_len, gfp_t gfp); - int (*wlan_get_msg)(struct airoha_npu *npu, int ifindex, - enum airoha_npu_wlan_get_cmd func_id, - void *data, int data_len, gfp_t gfp); - u32 (*wlan_get_queue_addr)(struct airoha_npu *npu, int qid, - bool xmit); - void (*wlan_set_irq_status)(struct airoha_npu *npu, u32 val); - u32 (*wlan_get_irq_status)(struct airoha_npu *npu, int q); - void (*wlan_enable_irq)(struct airoha_npu *npu, int q); - void (*wlan_disable_irq)(struct airoha_npu *npu, int q); - } ops; -}; - -struct airoha_npu *airoha_npu_get(struct device *dev, dma_addr_t *stats_addr); -void airoha_npu_put(struct airoha_npu *npu); diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index 47411d2cbd28..82163392332c 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -7,10 +7,10 @@ #include #include #include +#include #include #include -#include "airoha_npu.h" #include "airoha_regs.h" #include "airoha_eth.h" diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h new file mode 100644 index 000000000000..117c63c2448d --- /dev/null +++ b/include/linux/soc/airoha/airoha_offload.h @@ -0,0 +1,260 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025 AIROHA Inc + * Author: Lorenzo Bianconi + */ +#ifndef AIROHA_OFFLOAD_H +#define AIROHA_OFFLOAD_H + +#include +#include + +#define NPU_NUM_CORES 8 +#define NPU_NUM_IRQ 6 +#define NPU_RX0_DESC_NUM 512 +#define NPU_RX1_DESC_NUM 512 + +/* CTRL */ +#define NPU_RX_DMA_DESC_LAST_MASK BIT(29) +#define NPU_RX_DMA_DESC_LEN_MASK GENMASK(28, 15) +#define NPU_RX_DMA_DESC_CUR_LEN_MASK GENMASK(14, 1) +#define NPU_RX_DMA_DESC_DONE_MASK BIT(0) +/* INFO */ +#define NPU_RX_DMA_PKT_COUNT_MASK GENMASK(31, 28) +#define NPU_RX_DMA_PKT_ID_MASK GENMASK(28, 26) +#define NPU_RX_DMA_SRC_PORT_MASK GENMASK(25, 21) +#define NPU_RX_DMA_CRSN_MASK GENMASK(20, 16) +#define NPU_RX_DMA_FOE_ID_MASK GENMASK(15, 0) +/* DATA */ +#define NPU_RX_DMA_SID_MASK GENMASK(31, 16) +#define NPU_RX_DMA_FRAG_TYPE_MASK GENMASK(15, 14) +#define NPU_RX_DMA_PRIORITY_MASK GENMASK(13, 10) +#define NPU_RX_DMA_RADIO_ID_MASK GENMASK(9, 6) +#define NPU_RX_DMA_VAP_ID_MASK GENMASK(5, 2) +#define NPU_RX_DMA_FRAME_TYPE_MASK GENMASK(1, 0) + +struct airoha_npu_rx_dma_desc { + u32 ctrl; + u32 info; + u32 data; + u32 addr; + u64 rsv; +} __packed; + +/* CTRL */ +#define NPU_TX_DMA_DESC_SCHED_MASK BIT(31) +#define NPU_TX_DMA_DESC_LEN_MASK GENMASK(30, 18) +#define NPU_TX_DMA_DESC_VEND_LEN_MASK GENMASK(17, 1) +#define NPU_TX_DMA_DESC_DONE_MASK BIT(0) + +#define NPU_TXWI_LEN 192 + +struct airoha_npu_tx_dma_desc { + u32 ctrl; + u32 addr; + u64 rsv; + u8 txwi[NPU_TXWI_LEN]; +} __packed; + +enum airoha_npu_wlan_set_cmd { + WLAN_FUNC_SET_WAIT_PCIE_ADDR, + WLAN_FUNC_SET_WAIT_DESC, + WLAN_FUNC_SET_WAIT_NPU_INIT_DONE, + WLAN_FUNC_SET_WAIT_TRAN_TO_CPU, + WLAN_FUNC_SET_WAIT_BA_WIN_SIZE, + WLAN_FUNC_SET_WAIT_DRIVER_MODEL, + WLAN_FUNC_SET_WAIT_DEL_STA, + WLAN_FUNC_SET_WAIT_DRAM_BA_NODE_ADDR, + WLAN_FUNC_SET_WAIT_PKT_BUF_ADDR, + WLAN_FUNC_SET_WAIT_IS_TEST_NOBA, + 
WLAN_FUNC_SET_WAIT_FLUSHONE_TIMEOUT, + WLAN_FUNC_SET_WAIT_FLUSHALL_TIMEOUT, + WLAN_FUNC_SET_WAIT_IS_FORCE_TO_CPU, + WLAN_FUNC_SET_WAIT_PCIE_STATE, + WLAN_FUNC_SET_WAIT_PCIE_PORT_TYPE, + WLAN_FUNC_SET_WAIT_ERROR_RETRY_TIMES, + WLAN_FUNC_SET_WAIT_BAR_INFO, + WLAN_FUNC_SET_WAIT_FAST_FLAG, + WLAN_FUNC_SET_WAIT_NPU_BAND0_ONCPU, + WLAN_FUNC_SET_WAIT_TX_RING_PCIE_ADDR, + WLAN_FUNC_SET_WAIT_TX_DESC_HW_BASE, + WLAN_FUNC_SET_WAIT_TX_BUF_SPACE_HW_BASE, + WLAN_FUNC_SET_WAIT_RX_RING_FOR_TXDONE_HW_BASE, + WLAN_FUNC_SET_WAIT_TX_PKT_BUF_ADDR, + WLAN_FUNC_SET_WAIT_INODE_TXRX_REG_ADDR, + WLAN_FUNC_SET_WAIT_INODE_DEBUG_FLAG, + WLAN_FUNC_SET_WAIT_INODE_HW_CFG_INFO, + WLAN_FUNC_SET_WAIT_INODE_STOP_ACTION, + WLAN_FUNC_SET_WAIT_INODE_PCIE_SWAP, + WLAN_FUNC_SET_WAIT_RATELIMIT_CTRL, + WLAN_FUNC_SET_WAIT_HWNAT_INIT, + WLAN_FUNC_SET_WAIT_ARHT_CHIP_INFO, + WLAN_FUNC_SET_WAIT_TX_BUF_CHECK_ADDR, + WLAN_FUNC_SET_WAIT_TOKEN_ID_SIZE, +}; + +enum airoha_npu_wlan_get_cmd { + WLAN_FUNC_GET_WAIT_NPU_INFO, + WLAN_FUNC_GET_WAIT_LAST_RATE, + WLAN_FUNC_GET_WAIT_COUNTER, + WLAN_FUNC_GET_WAIT_DBG_COUNTER, + WLAN_FUNC_GET_WAIT_RXDESC_BASE, + WLAN_FUNC_GET_WAIT_WCID_DBG_COUNTER, + WLAN_FUNC_GET_WAIT_DMA_ADDR, + WLAN_FUNC_GET_WAIT_RING_SIZE, + WLAN_FUNC_GET_WAIT_NPU_SUPPORT_MAP, + WLAN_FUNC_GET_WAIT_MDC_LOCK_ADDRESS, + WLAN_FUNC_GET_WAIT_NPU_VERSION, +}; + +struct airoha_npu { +#if (IS_BUILTIN(CONFIG_NET_AIROHA_NPU) || IS_MODULE(CONFIG_NET_AIROHA_NPU)) + struct device *dev; + struct regmap *regmap; + + struct airoha_npu_core { + struct airoha_npu *npu; + /* protect concurrent npu memory accesses */ + spinlock_t lock; + struct work_struct wdt_work; + } cores[NPU_NUM_CORES]; + + int irqs[NPU_NUM_IRQ]; + + struct airoha_foe_stats __iomem *stats; + + struct { + int (*ppe_init)(struct airoha_npu *npu); + int (*ppe_deinit)(struct airoha_npu *npu); + int (*ppe_flush_sram_entries)(struct airoha_npu *npu, + dma_addr_t foe_addr, + int sram_num_entries); + int (*ppe_foe_commit_entry)(struct airoha_npu *npu, + dma_addr_t foe_addr, + u32 entry_size, u32 hash, + bool ppe2); + int (*wlan_init_reserved_memory)(struct airoha_npu *npu); + int (*wlan_send_msg)(struct airoha_npu *npu, int ifindex, + enum airoha_npu_wlan_set_cmd func_id, + void *data, int data_len, gfp_t gfp); + int (*wlan_get_msg)(struct airoha_npu *npu, int ifindex, + enum airoha_npu_wlan_get_cmd func_id, + void *data, int data_len, gfp_t gfp); + u32 (*wlan_get_queue_addr)(struct airoha_npu *npu, int qid, + bool xmit); + void (*wlan_set_irq_status)(struct airoha_npu *npu, u32 val); + u32 (*wlan_get_irq_status)(struct airoha_npu *npu, int q); + void (*wlan_enable_irq)(struct airoha_npu *npu, int q); + void (*wlan_disable_irq)(struct airoha_npu *npu, int q); + } ops; +#endif +}; + +#if (IS_BUILTIN(CONFIG_NET_AIROHA_NPU) || IS_MODULE(CONFIG_NET_AIROHA_NPU)) +struct airoha_npu *airoha_npu_get(struct device *dev, dma_addr_t *stats_addr); +void airoha_npu_put(struct airoha_npu *npu); + +static inline int airoha_npu_wlan_init_reserved_memory(struct airoha_npu *npu) +{ + return npu->ops.wlan_init_reserved_memory(npu); +} + +static inline int airoha_npu_wlan_send_msg(struct airoha_npu *npu, + int ifindex, + enum airoha_npu_wlan_set_cmd cmd, + void *data, int data_len, gfp_t gfp) +{ + return npu->ops.wlan_send_msg(npu, ifindex, cmd, data, data_len, gfp); +} + +static inline int airoha_npu_wlan_get_msg(struct airoha_npu *npu, int ifindex, + enum airoha_npu_wlan_get_cmd cmd, + void *data, int data_len, gfp_t gfp) +{ + return npu->ops.wlan_get_msg(npu, ifindex, cmd, data, data_len, gfp); +} + 
+static inline u32 airoha_npu_wlan_get_queue_addr(struct airoha_npu *npu, + int qid, bool xmit) +{ + return npu->ops.wlan_get_queue_addr(npu, qid, xmit); +} + +static inline void airoha_npu_wlan_set_irq_status(struct airoha_npu *npu, + u32 val) +{ + npu->ops.wlan_set_irq_status(npu, val); +} + +static inline u32 airoha_npu_wlan_get_irq_status(struct airoha_npu *npu, int q) +{ + return npu->ops.wlan_get_irq_status(npu, q); +} + +static inline void airoha_npu_wlan_enable_irq(struct airoha_npu *npu, int q) +{ + npu->ops.wlan_enable_irq(npu, q); +} + +static inline void airoha_npu_wlan_disable_irq(struct airoha_npu *npu, int q) +{ + npu->ops.wlan_disable_irq(npu, q); +} +#else +static inline struct airoha_npu *airoha_npu_get(struct device *dev, + dma_addr_t *foe_stats_addr) +{ + return NULL; +} + +static inline void airoha_npu_put(struct airoha_npu *npu) +{ +} + +static inline int airoha_npu_wlan_init_reserved_memory(struct airoha_npu *npu) +{ + return -EOPNOTSUPP; +} + +static inline int airoha_npu_wlan_send_msg(struct airoha_npu *npu, + int ifindex, + enum airoha_npu_wlan_set_cmd cmd, + void *data, int data_len, gfp_t gfp) +{ + return -EOPNOTSUPP; +} + +static inline int airoha_npu_wlan_get_msg(struct airoha_npu *npu, int ifindex, + enum airoha_npu_wlan_get_cmd cmd, + void *data, int data_len, gfp_t gfp) +{ + return -EOPNOTSUPP; +} + +static inline u32 airoha_npu_wlan_get_queue_addr(struct airoha_npu *npu, + int qid, bool xmit) +{ + return 0; +} + +static inline void airoha_npu_wlan_set_irq_status(struct airoha_npu *npu, + u32 val) +{ +} + +static inline u32 airoha_npu_wlan_get_irq_status(struct airoha_npu *npu, + int q) +{ + return 0; +} + +static inline void airoha_npu_wlan_enable_irq(struct airoha_npu *npu, int q) +{ +} + +static inline void airoha_npu_wlan_disable_irq(struct airoha_npu *npu, int q) +{ +} +#endif + +#endif /* AIROHA_OFFLOAD_H */ -- cgit v1.2.3 From 3eb50369c09efb0f668a7f568a7e6f7cf4194cde Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 21 May 2024 23:22:01 -0400 Subject: scsi: switch ->bios_param() to passing gendisk Instances are passed struct block_device *bdev argument; the only thing it is used for (if it's used in the first place) is bdev->bd_disk. Might as well pass that in the first place... Reviewed-by: Martin K. 
Petersen Reviewed-by: Christoph Hellwig Acked-by: Jens Axboe Signed-off-by: Al Viro --- Documentation/scsi/scsi_mid_low_api.rst | 4 ++-- drivers/ata/libata-scsi.c | 4 ++-- drivers/message/fusion/mptscsih.c | 2 +- drivers/message/fusion/mptscsih.h | 2 +- drivers/scsi/3w-9xxx.c | 2 +- drivers/scsi/3w-sas.c | 2 +- drivers/scsi/3w-xxxx.c | 2 +- drivers/scsi/BusLogic.c | 4 ++-- drivers/scsi/BusLogic.h | 2 +- drivers/scsi/aacraid/linit.c | 6 +++--- drivers/scsi/advansys.c | 2 +- drivers/scsi/aha152x.c | 4 ++-- drivers/scsi/aha1542.c | 2 +- drivers/scsi/aha1740.c | 2 +- drivers/scsi/aic7xxx/aic79xx_osm.c | 4 ++-- drivers/scsi/aic7xxx/aic7xxx_osm.c | 4 ++-- drivers/scsi/arcmsr/arcmsr_hba.c | 6 +++--- drivers/scsi/atp870u.c | 2 +- drivers/scsi/fdomain.c | 4 ++-- drivers/scsi/imm.c | 2 +- drivers/scsi/initio.c | 4 ++-- drivers/scsi/ipr.c | 8 ++++---- drivers/scsi/ips.c | 2 +- drivers/scsi/ips.h | 2 +- drivers/scsi/libsas/sas_scsi_host.c | 2 +- drivers/scsi/megaraid.c | 4 ++-- drivers/scsi/megaraid.h | 2 +- drivers/scsi/megaraid/megaraid_sas_base.c | 4 ++-- drivers/scsi/mpi3mr/mpi3mr_os.c | 4 ++-- drivers/scsi/mpt3sas/mpt3sas_scsih.c | 4 ++-- drivers/scsi/mvumi.c | 2 +- drivers/scsi/myrb.c | 2 +- drivers/scsi/pcmcia/sym53c500_cs.c | 2 +- drivers/scsi/ppa.c | 2 +- drivers/scsi/qla1280.c | 2 +- drivers/scsi/qlogicfas408.c | 2 +- drivers/scsi/qlogicfas408.h | 2 +- drivers/scsi/scsicam.c | 6 +++--- drivers/scsi/sd.c | 4 ++-- drivers/scsi/stex.c | 2 +- drivers/scsi/storvsc_drv.c | 2 +- drivers/scsi/wd719x.c | 2 +- include/linux/libata.h | 2 +- include/scsi/libsas.h | 2 +- include/scsi/scsi_host.h | 2 +- include/scsi/scsicam.h | 2 +- 46 files changed, 68 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/Documentation/scsi/scsi_mid_low_api.rst b/Documentation/scsi/scsi_mid_low_api.rst index 96c8230d638e..634f5c28a849 100644 --- a/Documentation/scsi/scsi_mid_low_api.rst +++ b/Documentation/scsi/scsi_mid_low_api.rst @@ -623,7 +623,7 @@ Details:: * bios_param - fetch head, sector, cylinder info for a disk * @sdev: pointer to scsi device context (defined in * include/scsi/scsi_device.h) - * @bdev: pointer to block device context (defined in fs.h) + * @disk: pointer to gendisk (defined in blkdev.h) * @capacity: device size (in 512 byte sectors) * @params: three element array to place output: * params[0] number of heads (max 255) @@ -643,7 +643,7 @@ Details:: * * Optionally defined in: LLD **/ - int bios_param(struct scsi_device * sdev, struct block_device *bdev, + int bios_param(struct scsi_device * sdev, struct gendisk *disk, sector_t capacity, int params[3]) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 57f674f51b0c..3603b430724b 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -351,7 +351,7 @@ EXPORT_SYMBOL_GPL(ata_common_sdev_groups); /** * ata_std_bios_param - generic bios head/sector/cylinder calculator used by sd. * @sdev: SCSI device for which BIOS geometry is to be determined - * @bdev: block device associated with @sdev + * @unused: gendisk associated with @sdev * @capacity: capacity of SCSI device * @geom: location to which geometry will be output * @@ -366,7 +366,7 @@ EXPORT_SYMBOL_GPL(ata_common_sdev_groups); * RETURNS: * Zero. 
*/ -int ata_std_bios_param(struct scsi_device *sdev, struct block_device *bdev, +int ata_std_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { geom[0] = 255; diff --git a/drivers/message/fusion/mptscsih.c b/drivers/message/fusion/mptscsih.c index 3a64dc7a7e27..3304f8824cf7 100644 --- a/drivers/message/fusion/mptscsih.c +++ b/drivers/message/fusion/mptscsih.c @@ -2074,7 +2074,7 @@ mptscsih_taskmgmt_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, * This is anyones guess quite frankly. */ int -mptscsih_bios_param(struct scsi_device * sdev, struct block_device *bdev, +mptscsih_bios_param(struct scsi_device * sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads; diff --git a/drivers/message/fusion/mptscsih.h b/drivers/message/fusion/mptscsih.h index 8c2bb2331fc1..f9678d48100c 100644 --- a/drivers/message/fusion/mptscsih.h +++ b/drivers/message/fusion/mptscsih.h @@ -123,7 +123,7 @@ extern int mptscsih_abort(struct scsi_cmnd * SCpnt); extern int mptscsih_dev_reset(struct scsi_cmnd * SCpnt); extern int mptscsih_bus_reset(struct scsi_cmnd * SCpnt); extern int mptscsih_host_reset(struct scsi_cmnd *SCpnt); -extern int mptscsih_bios_param(struct scsi_device * sdev, struct block_device *bdev, sector_t capacity, int geom[]); +extern int mptscsih_bios_param(struct scsi_device * sdev, struct gendisk *unused, sector_t capacity, int geom[]); extern int mptscsih_io_done(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r); extern int mptscsih_taskmgmt_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r); extern int mptscsih_scandv_complete(MPT_ADAPTER *ioc, MPT_FRAME_HDR *mf, MPT_FRAME_HDR *r); diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c index 883d4a12a172..a377a6f6900a 100644 --- a/drivers/scsi/3w-9xxx.c +++ b/drivers/scsi/3w-9xxx.c @@ -1695,7 +1695,7 @@ out: } /* End twa_reset_sequence() */ /* This funciton returns unit geometry in cylinders/heads/sectors */ -static int twa_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]) +static int twa_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors, cylinders; diff --git a/drivers/scsi/3w-sas.c b/drivers/scsi/3w-sas.c index 8d4174c7107e..e319be7d369c 100644 --- a/drivers/scsi/3w-sas.c +++ b/drivers/scsi/3w-sas.c @@ -1404,7 +1404,7 @@ out: } /* End twl_reset_device_extension() */ /* This funciton returns unit geometry in cylinders/heads/sectors */ -static int twl_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]) +static int twl_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors; diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c index 89bd56f78ef9..0306a228c702 100644 --- a/drivers/scsi/3w-xxxx.c +++ b/drivers/scsi/3w-xxxx.c @@ -1340,7 +1340,7 @@ static int tw_reset_device_extension(TW_Device_Extension *tw_dev) } /* End tw_reset_device_extension() */ /* This funciton returns unit geometry in cylinders/heads/sectors */ -static int tw_scsi_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int tw_scsi_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors, cylinders; diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c index 743be2ef6d1a..8b17d8103e90 100644 --- a/drivers/scsi/BusLogic.c +++ b/drivers/scsi/BusLogic.c @@ -3240,7 +3240,7 @@ static int 
blogic_resetadapter(struct blogic_adapter *adapter, bool hard_reset) the BIOS, and a warning may be displayed. */ -static int blogic_diskparam(struct scsi_device *sdev, struct block_device *dev, +static int blogic_diskparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int *params) { struct blogic_adapter *adapter = @@ -3261,7 +3261,7 @@ static int blogic_diskparam(struct scsi_device *sdev, struct block_device *dev, diskparam->sectors = 32; } diskparam->cylinders = (unsigned long) capacity / (diskparam->heads * diskparam->sectors); - buf = scsi_bios_ptable(dev->bd_disk); + buf = scsi_bios_ptable(disk); if (buf == NULL) return 0; /* diff --git a/drivers/scsi/BusLogic.h b/drivers/scsi/BusLogic.h index 61bf26d4fc10..79de815e33b0 100644 --- a/drivers/scsi/BusLogic.h +++ b/drivers/scsi/BusLogic.h @@ -1273,7 +1273,7 @@ static inline void blogic_incszbucket(unsigned int *cmdsz_buckets, static const char *blogic_drvr_info(struct Scsi_Host *); static int blogic_qcmd(struct Scsi_Host *h, struct scsi_cmnd *); -static int blogic_diskparam(struct scsi_device *, struct block_device *, sector_t, int *); +static int blogic_diskparam(struct scsi_device *, struct gendisk *, sector_t, int *); static int blogic_sdev_configure(struct scsi_device *, struct queue_limits *lim); static void blogic_qcompleted_ccb(struct blogic_ccb *); diff --git a/drivers/scsi/aacraid/linit.c b/drivers/scsi/aacraid/linit.c index 2264a97d91a0..ea66196ef7c7 100644 --- a/drivers/scsi/aacraid/linit.c +++ b/drivers/scsi/aacraid/linit.c @@ -273,7 +273,7 @@ struct aac_driver_ident* aac_get_driver_ident(int devtype) /** * aac_biosparm - return BIOS parameters for disk * @sdev: The scsi device corresponding to the disk - * @bdev: the block device corresponding to the disk + * @disk: the gendisk corresponding to the disk * @capacity: the sector capacity of the disk * @geom: geometry block to fill in * @@ -292,7 +292,7 @@ struct aac_driver_ident* aac_get_driver_ident(int devtype) * be displayed. */ -static int aac_biosparm(struct scsi_device *sdev, struct block_device *bdev, +static int aac_biosparm(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int *geom) { struct diskparm *param = (struct diskparm *)geom; @@ -324,7 +324,7 @@ static int aac_biosparm(struct scsi_device *sdev, struct block_device *bdev, * entry whose end_head matches one of the standard geometry * translations ( 64/32, 128/32, 255/63 ). 
*/ - buf = scsi_bios_ptable(bdev->bd_disk); + buf = scsi_bios_ptable(disk); if (!buf) return 0; if (*(__le16 *)(buf + 0x40) == cpu_to_le16(MSDOS_LABEL_MAGIC)) { diff --git a/drivers/scsi/advansys.c b/drivers/scsi/advansys.c index 3a2c336307c0..063e1b5818d3 100644 --- a/drivers/scsi/advansys.c +++ b/drivers/scsi/advansys.c @@ -7096,7 +7096,7 @@ static int advansys_reset(struct scsi_cmnd *scp) * ip[2]: cylinders */ static int -advansys_biosparam(struct scsi_device *sdev, struct block_device *bdev, +advansys_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int ip[]) { struct asc_board *boardp = shost_priv(sdev->host); diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c index e94c0a19c435..182aa80ec4c6 100644 --- a/drivers/scsi/aha152x.c +++ b/drivers/scsi/aha152x.c @@ -1246,7 +1246,7 @@ int aha152x_host_reset_host(struct Scsi_Host *shpnt) * Return the "logical geometry" * */ -static int aha152x_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int aha152x_biosparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int *info_array) { struct Scsi_Host *shpnt = sdev->host; @@ -1261,7 +1261,7 @@ static int aha152x_biosparam(struct scsi_device *sdev, struct block_device *bdev int info[3]; /* try to figure out the geometry from the partition table */ - if (scsicam_bios_param(bdev, capacity, info) < 0 || + if (scsicam_bios_param(disk, capacity, info) < 0 || !((info[0] == 64 && info[1] == 32) || (info[0] == 255 && info[1] == 63))) { if (EXT_TRANS) { printk(KERN_NOTICE diff --git a/drivers/scsi/aha1542.c b/drivers/scsi/aha1542.c index 389499d3e00a..371e8300f029 100644 --- a/drivers/scsi/aha1542.c +++ b/drivers/scsi/aha1542.c @@ -992,7 +992,7 @@ static int aha1542_host_reset(struct scsi_cmnd *cmd) } static int aha1542_biosparam(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int geom[]) + struct gendisk *unused, sector_t capacity, int geom[]) { struct aha1542_hostdata *aha1542 = shost_priv(sdev->host); diff --git a/drivers/scsi/aha1740.c b/drivers/scsi/aha1740.c index be7ebbbb9ba8..b234621f6b37 100644 --- a/drivers/scsi/aha1740.c +++ b/drivers/scsi/aha1740.c @@ -510,7 +510,7 @@ static void aha1740_getconfig(unsigned int base, unsigned int *irq_level, } static int aha1740_biosparam(struct scsi_device *sdev, - struct block_device *dev, + struct gendisk *unused, sector_t capacity, int* ip) { int size = capacity; diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c index 2cff19e95fec..c3d1b9dd24ae 100644 --- a/drivers/scsi/aic7xxx/aic79xx_osm.c +++ b/drivers/scsi/aic7xxx/aic79xx_osm.c @@ -720,7 +720,7 @@ ahd_linux_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) * Return the disk geometry for the given SCSI device. 
*/ static int -ahd_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, +ahd_linux_biosparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int geom[]) { int heads; @@ -731,7 +731,7 @@ ahd_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, ahd = *((struct ahd_softc **)sdev->host->hostdata); - if (scsi_partsize(bdev->bd_disk, capacity, geom)) + if (scsi_partsize(disk, capacity, geom)) return 0; heads = 64; diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c index 05bdb73d1157..8b2b98666d61 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_osm.c +++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c @@ -683,7 +683,7 @@ ahc_linux_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) * Return the disk geometry for the given SCSI device. */ static int -ahc_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, +ahc_linux_biosparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int geom[]) { int heads; @@ -696,7 +696,7 @@ ahc_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, ahc = *((struct ahc_softc **)sdev->host->hostdata); channel = sdev_channel(sdev); - if (scsi_partsize(bdev->bd_disk, capacity, geom)) + if (scsi_partsize(disk, capacity, geom)) return 0; heads = 64; diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index 6968da9fb67c..f0c5a30ce51b 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -112,7 +112,7 @@ static int arcmsr_iop_confirm(struct AdapterControlBlock *acb); static int arcmsr_abort(struct scsi_cmnd *); static int arcmsr_bus_reset(struct scsi_cmnd *); static int arcmsr_bios_param(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int *info); + struct gendisk *disk, sector_t capacity, int *info); static int arcmsr_queue_command(struct Scsi_Host *h, struct scsi_cmnd *cmd); static int arcmsr_probe(struct pci_dev *pdev, const struct pci_device_id *id); @@ -377,11 +377,11 @@ static irqreturn_t arcmsr_do_interrupt(int irq, void *dev_id) } static int arcmsr_bios_param(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int *geom) + struct gendisk *disk, sector_t capacity, int *geom) { int heads, sectors, cylinders, total_capacity; - if (scsi_partsize(bdev->bd_disk, capacity, geom)) + if (scsi_partsize(disk, capacity, geom)) return 0; total_capacity = capacity; diff --git a/drivers/scsi/atp870u.c b/drivers/scsi/atp870u.c index 401242912855..df6f40b51deb 100644 --- a/drivers/scsi/atp870u.c +++ b/drivers/scsi/atp870u.c @@ -1692,7 +1692,7 @@ static int atp870u_show_info(struct seq_file *m, struct Scsi_Host *HBAptr) } -static int atp870u_biosparam(struct scsi_device *disk, struct block_device *dev, +static int atp870u_biosparam(struct scsi_device *disk, struct gendisk *unused, sector_t capacity, int *ip) { int heads, sectors, cylinders; diff --git a/drivers/scsi/fdomain.c b/drivers/scsi/fdomain.c index 4a3716dc644c..c0b2a980db34 100644 --- a/drivers/scsi/fdomain.c +++ b/drivers/scsi/fdomain.c @@ -469,10 +469,10 @@ static int fdomain_host_reset(struct scsi_cmnd *cmd) } static int fdomain_biosparam(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, + struct gendisk *disk, sector_t capacity, int geom[]) { - unsigned char *p = scsi_bios_ptable(bdev->bd_disk); + unsigned char *p = scsi_bios_ptable(disk); if (p && p[65] == 0xaa && p[64] == 0x55 /* Partition table valid */ && p[4]) { /* Partition type */ diff --git 
a/drivers/scsi/imm.c b/drivers/scsi/imm.c index 0821cf994b98..5c602c057798 100644 --- a/drivers/scsi/imm.c +++ b/drivers/scsi/imm.c @@ -954,7 +954,7 @@ static DEF_SCSI_QCMD(imm_queuecommand) * be done in sd.c. Even if it gets fixed there, this will still * work. */ -static int imm_biosparam(struct scsi_device *sdev, struct block_device *dev, +static int imm_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int ip[]) { ip[0] = 0x40; diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c index 8648bd965287..ed34ad92c807 100644 --- a/drivers/scsi/initio.c +++ b/drivers/scsi/initio.c @@ -2645,7 +2645,7 @@ static int i91u_bus_reset(struct scsi_cmnd * cmnd) /** * i91u_biosparam - return the "logical geometry * @sdev: SCSI device - * @dev: Matching block device + * @unused: Matching gendisk * @capacity: Sector size of drive * @info_array: Return space for BIOS geometry * @@ -2655,7 +2655,7 @@ static int i91u_bus_reset(struct scsi_cmnd * cmnd) * FIXME: limited to 2^32 sector devices. */ -static int i91u_biosparam(struct scsi_device *sdev, struct block_device *dev, +static int i91u_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int *info_array) { struct initio_host *host; /* Point to Host adapter control block */ diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index d06b79f03538..dd6754db7e4c 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -4644,10 +4644,10 @@ ATTRIBUTE_GROUPS(ipr_dev); /** * ipr_biosparam - Return the HSC mapping - * @sdev: scsi device struct - * @block_device: block device pointer + * @sdev: scsi device struct + * @unused: gendisk pointer * @capacity: capacity of the device - * @parm: Array containing returned HSC values. + * @parm: Array containing returned HSC values. * * This function generates the HSC parms that fdisk uses. 
* We want to make sure we return something that places partitions @@ -4657,7 +4657,7 @@ ATTRIBUTE_GROUPS(ipr_dev); * 0 on success **/ static int ipr_biosparam(struct scsi_device *sdev, - struct block_device *block_device, + struct gendisk *unused, sector_t capacity, int *parm) { int heads, sectors; diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c index 94adb6ac02a4..3393a288fd23 100644 --- a/drivers/scsi/ips.c +++ b/drivers/scsi/ips.c @@ -1123,7 +1123,7 @@ static DEF_SCSI_QCMD(ips_queue) /* Set bios geometry for the controller */ /* */ /****************************************************************************/ -static int ips_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int ips_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { ips_ha_t *ha = (ips_ha_t *) sdev->host->hostdata; diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h index 8ac932ec4444..30a4d4a580e9 100644 --- a/drivers/scsi/ips.h +++ b/drivers/scsi/ips.h @@ -398,7 +398,7 @@ /* * Scsi_Host Template */ - static int ips_biosparam(struct scsi_device *sdev, struct block_device *bdev, + static int ips_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]); static int ips_sdev_configure(struct scsi_device *SDptr, struct queue_limits *lim); diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index 928723c90b75..ffa5b49aaf08 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -845,7 +845,7 @@ int sas_change_queue_depth(struct scsi_device *sdev, int depth) EXPORT_SYMBOL_GPL(sas_change_queue_depth); int sas_bios_param(struct scsi_device *scsi_dev, - struct block_device *bdev, + struct gendisk *unused, sector_t capacity, int *hsc) { hsc[0] = 255; diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index c7581c7829af..a00622c0c526 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -2780,7 +2780,7 @@ static inline void mega_create_proc_entry(int index, struct proc_dir_entry *pare * Return the disk geometry for a particular disk */ static int -megaraid_biosparam(struct scsi_device *sdev, struct block_device *bdev, +megaraid_biosparam(struct scsi_device *sdev, struct gendisk *disk, sector_t capacity, int geom[]) { adapter_t *adapter; @@ -2813,7 +2813,7 @@ megaraid_biosparam(struct scsi_device *sdev, struct block_device *bdev, geom[2] = cylinders; } else { - if (scsi_partsize(bdev->bd_disk, capacity, geom)) + if (scsi_partsize(disk, capacity, geom)) return 0; dev_info(&adapter->dev->dev, diff --git a/drivers/scsi/megaraid.h b/drivers/scsi/megaraid.h index 013fbfb911b9..d6bfd26a8843 100644 --- a/drivers/scsi/megaraid.h +++ b/drivers/scsi/megaraid.h @@ -975,7 +975,7 @@ static void mega_free_scb(adapter_t *, scb_t *); static int megaraid_abort(struct scsi_cmnd *); static int megaraid_reset(struct scsi_cmnd *); static int megaraid_abort_and_reset(adapter_t *, struct scsi_cmnd *, int); -static int megaraid_biosparam(struct scsi_device *, struct block_device *, +static int megaraid_biosparam(struct scsi_device *, struct gendisk *, sector_t, int []); static int mega_build_sglist (adapter_t *adapter, scb_t *scb, diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 615e06fd4ee8..abbbc4b36cd1 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -3137,12 +3137,12 @@ static int megasas_reset_target(struct scsi_cmnd *scmd) /** * 
megasas_bios_param - Returns disk geometry for a disk * @sdev: device handle - * @bdev: block device + * @unused: gendisk * @capacity: drive capacity * @geom: geometry parameters */ static int -megasas_bios_param(struct scsi_device *sdev, struct block_device *bdev, +megasas_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads; diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c index e467b56949e9..3df52a3b435b 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_os.c +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c @@ -4031,7 +4031,7 @@ out: /** * mpi3mr_bios_param - BIOS param callback * @sdev: SCSI device reference - * @bdev: Block device reference + * @unused: gendisk reference * @capacity: Capacity in logical sectors * @params: Parameter array * @@ -4040,7 +4040,7 @@ out: * Return: 0 always */ static int mpi3mr_bios_param(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int params[]) + struct gendisk *unused, sector_t capacity, int params[]) { int heads; int sectors; diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index 967af259118e..7092d0debef3 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -2754,7 +2754,7 @@ scsih_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) /** * scsih_bios_param - fetch head, sector, cylinder info for a disk * @sdev: scsi device struct - * @bdev: pointer to block device context + * @unused: pointer to gendisk * @capacity: device size (in 512 byte sectors) * @params: three element array to place output: * params[0] number of heads (max 255) @@ -2762,7 +2762,7 @@ scsih_sdev_configure(struct scsi_device *sdev, struct queue_limits *lim) * params[2] number of cylinders */ static int -scsih_bios_param(struct scsi_device *sdev, struct block_device *bdev, +scsih_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int params[]) { int heads; diff --git a/drivers/scsi/mvumi.c b/drivers/scsi/mvumi.c index 96549e7f5705..bdc2f2f17753 100644 --- a/drivers/scsi/mvumi.c +++ b/drivers/scsi/mvumi.c @@ -2142,7 +2142,7 @@ static enum scsi_timeout_action mvumi_timed_out(struct scsi_cmnd *scmd) } static int -mvumi_bios_param(struct scsi_device *sdev, struct block_device *bdev, +mvumi_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors; diff --git a/drivers/scsi/myrb.c b/drivers/scsi/myrb.c index 486db5b2f05d..b8453c0333dc 100644 --- a/drivers/scsi/myrb.c +++ b/drivers/scsi/myrb.c @@ -1745,7 +1745,7 @@ static void myrb_sdev_destroy(struct scsi_device *sdev) kfree(sdev->hostdata); } -static int myrb_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int myrb_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { struct myrb_hba *cb = shost_priv(sdev->host); diff --git a/drivers/scsi/pcmcia/sym53c500_cs.c b/drivers/scsi/pcmcia/sym53c500_cs.c index 278c78d066c4..a3b505240351 100644 --- a/drivers/scsi/pcmcia/sym53c500_cs.c +++ b/drivers/scsi/pcmcia/sym53c500_cs.c @@ -597,7 +597,7 @@ SYM53C500_host_reset(struct scsi_cmnd *SCpnt) static int SYM53C500_biosparm(struct scsi_device *disk, - struct block_device *dev, + struct gendisk *unused, sector_t capacity, int *info_array) { int size; diff --git a/drivers/scsi/ppa.c b/drivers/scsi/ppa.c index 1ed3171f1797..ea682f3044b6 100644 --- a/drivers/scsi/ppa.c +++ b/drivers/scsi/ppa.c @@ -845,7 +845,7 @@ static 
DEF_SCSI_QCMD(ppa_queuecommand) * be done in sd.c. Even if it gets fixed there, this will still * work. */ -static int ppa_biosparam(struct scsi_device *sdev, struct block_device *dev, +static int ppa_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int ip[]) { ip[0] = 0x40; diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index 6af018f1ca22..ef841f643171 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c @@ -1023,7 +1023,7 @@ qla1280_eh_adapter_reset(struct scsi_cmnd *cmd) } static int -qla1280_biosparam(struct scsi_device *sdev, struct block_device *bdev, +qla1280_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { int heads, sectors, cylinders; diff --git a/drivers/scsi/qlogicfas408.c b/drivers/scsi/qlogicfas408.c index 3e065d5fc80c..1ce469b7db99 100644 --- a/drivers/scsi/qlogicfas408.c +++ b/drivers/scsi/qlogicfas408.c @@ -492,7 +492,7 @@ DEF_SCSI_QCMD(qlogicfas408_queuecommand) * Return bios parameters */ -int qlogicfas408_biosparam(struct scsi_device *disk, struct block_device *dev, +int qlogicfas408_biosparam(struct scsi_device *disk, struct gendisk *unused, sector_t capacity, int ip[]) { /* This should mimic the DOS Qlogic driver's behavior exactly */ diff --git a/drivers/scsi/qlogicfas408.h b/drivers/scsi/qlogicfas408.h index a971db11d293..83ef86c71f2f 100644 --- a/drivers/scsi/qlogicfas408.h +++ b/drivers/scsi/qlogicfas408.h @@ -106,7 +106,7 @@ struct qlogicfas408_priv { irqreturn_t qlogicfas408_ihandl(int irq, void *dev_id); int qlogicfas408_queuecommand(struct Scsi_Host *h, struct scsi_cmnd * cmd); int qlogicfas408_biosparam(struct scsi_device * disk, - struct block_device *dev, + struct gendisk *unused, sector_t capacity, int ip[]); int qlogicfas408_abort(struct scsi_cmnd * cmd); extern int qlogicfas408_host_reset(struct scsi_cmnd *cmd); diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c index 3ff2cf51d5b0..887de505bcf9 100644 --- a/drivers/scsi/scsicam.c +++ b/drivers/scsi/scsicam.c @@ -205,7 +205,7 @@ static int setsize(unsigned long capacity, unsigned int *cyls, unsigned int *hds /** * scsicam_bios_param - Determine geometry of a disk in cylinders/heads/sectors. - * @bdev: which device + * @disk: which device * @capacity: size of the disk in sectors * @ip: return value: ip[0]=heads, ip[1]=sectors, ip[2]=cylinders * @@ -215,13 +215,13 @@ static int setsize(unsigned long capacity, unsigned int *cyls, unsigned int *hds * * Returns : -1 on failure, 0 on success. 
*/ -int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip) +int scsicam_bios_param(struct gendisk *disk, sector_t capacity, int *ip) { u64 capacity64 = capacity; /* Suppress gcc warning */ int ret = 0; /* try to infer mapping from partition table */ - if (scsi_partsize(bdev->bd_disk, capacity, ip)) + if (scsi_partsize(disk, capacity, ip)) return 0; if (capacity64 < (1ULL << 32)) { diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 5b8668accf8e..3edcc43f180b 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1614,9 +1614,9 @@ static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) /* override with calculated, extended default, or driver values */ if (host->hostt->bios_param) - host->hostt->bios_param(sdp, bdev, capacity, diskinfo); + host->hostt->bios_param(sdp, bdev->bd_disk, capacity, diskinfo); else - scsicam_bios_param(bdev, capacity, diskinfo); + scsicam_bios_param(bdev->bd_disk, capacity, diskinfo); geo->heads = diskinfo[0]; geo->sectors = diskinfo[1]; diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c index 63ed7f9aaa93..d8ad02c29320 100644 --- a/drivers/scsi/stex.c +++ b/drivers/scsi/stex.c @@ -1457,7 +1457,7 @@ static void stex_reset_work(struct work_struct *work) } static int stex_biosparam(struct scsi_device *sdev, - struct block_device *bdev, sector_t capacity, int geom[]) + struct gendisk *unused, sector_t capacity, int geom[]) { int heads = 255, sectors = 63; diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index d9e59204a9c3..dc51ea352198 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1615,7 +1615,7 @@ static int storvsc_sdev_configure(struct scsi_device *sdevice, return 0; } -static int storvsc_get_chs(struct scsi_device *sdev, struct block_device * bdev, +static int storvsc_get_chs(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int *info) { sector_t nsect = capacity; diff --git a/drivers/scsi/wd719x.c b/drivers/scsi/wd719x.c index 5a380eecfc75..0c9987828774 100644 --- a/drivers/scsi/wd719x.c +++ b/drivers/scsi/wd719x.c @@ -544,7 +544,7 @@ static int wd719x_host_reset(struct scsi_cmnd *cmd) return wd719x_chip_init(wd) == 0 ? 
SUCCESS : FAILED; } -static int wd719x_biosparam(struct scsi_device *sdev, struct block_device *bdev, +static int wd719x_biosparam(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]) { if (capacity >= 0x200000) { diff --git a/include/linux/libata.h b/include/linux/libata.h index 0620dd67369f..21de0935775d 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1203,7 +1203,7 @@ extern void ata_qc_complete(struct ata_queued_cmd *qc); extern u64 ata_qc_get_active(struct ata_port *ap); extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd); extern int ata_std_bios_param(struct scsi_device *sdev, - struct block_device *bdev, + struct gendisk *unused, sector_t capacity, int geom[]); extern void ata_scsi_unlock_native_capacity(struct scsi_device *sdev); extern int ata_scsi_sdev_init(struct scsi_device *sdev); diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index ba460b6c0374..9c6e90829dbd 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -685,7 +685,7 @@ extern int sas_queuecommand(struct Scsi_Host *, struct scsi_cmnd *); extern int sas_target_alloc(struct scsi_target *); int sas_sdev_configure(struct scsi_device *dev, struct queue_limits *lim); extern int sas_change_queue_depth(struct scsi_device *, int new_depth); -extern int sas_bios_param(struct scsi_device *, struct block_device *, +extern int sas_bios_param(struct scsi_device *, struct gendisk *, sector_t capacity, int *hsc); int sas_execute_internal_abort_single(struct domain_device *device, u16 tag, unsigned int qid, diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index c53812b9026f..f5a243261236 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -318,7 +318,7 @@ struct scsi_host_template { * * Status: OPTIONAL */ - int (* bios_param)(struct scsi_device *, struct block_device *, + int (* bios_param)(struct scsi_device *, struct gendisk *, sector_t, int []); /* diff --git a/include/scsi/scsicam.h b/include/scsi/scsicam.h index 67f4e8835bc8..1131f51ed2c8 100644 --- a/include/scsi/scsicam.h +++ b/include/scsi/scsicam.h @@ -14,7 +14,7 @@ #ifndef SCSICAM_H #define SCSICAM_H struct gendisk; -int scsicam_bios_param(struct block_device *bdev, sector_t capacity, int *ip); +int scsicam_bios_param(struct gendisk *disk, sector_t capacity, int *ip); bool scsi_partsize(struct gendisk *disk, sector_t capacity, int geom[3]); unsigned char *scsi_bios_ptable(struct gendisk *disk); #endif /* def SCSICAM_H */ -- cgit v1.2.3 From 4fc8728aa34f54835b72e4db0f3db76a72948b65 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 21 May 2024 22:19:55 -0400 Subject: block: switch ->getgeo() to struct gendisk Instances are happier that way and it makes more sense anyway - the only part of the result that is related to partition we are given is the start sector, and that has been filled in by the caller. Everything else is a function of the disk. Only one instance (DASD) is ever looking at anything other than bdev->bd_disk and that one is trivial to adjust. Reviewed-by: Martin K. 
Petersen Reviewed-by: Christoph Hellwig Acked-by: Jens Axboe Signed-off-by: Al Viro --- Documentation/filesystems/locking.rst | 2 +- arch/m68k/emu/nfblock.c | 4 ++-- arch/um/drivers/ubd_kern.c | 6 +++--- block/ioctl.c | 4 ++-- block/partitions/ibm.c | 2 +- drivers/block/amiflop.c | 10 +++++----- drivers/block/aoe/aoeblk.c | 4 ++-- drivers/block/floppy.c | 4 ++-- drivers/block/mtip32xx/mtip32xx.c | 6 +++--- drivers/block/rnbd/rnbd-clt.c | 4 ++-- drivers/block/sunvdc.c | 3 +-- drivers/block/swim.c | 4 ++-- drivers/block/virtio_blk.c | 6 +++--- drivers/block/xen-blkfront.c | 4 ++-- drivers/md/dm.c | 4 ++-- drivers/md/md.c | 4 ++-- drivers/memstick/core/ms_block.c | 4 ++-- drivers/memstick/core/mspro_block.c | 4 ++-- drivers/mmc/core/block.c | 4 ++-- drivers/mtd/mtd_blkdevs.c | 4 ++-- drivers/mtd/ubi/block.c | 4 ++-- drivers/nvdimm/btt.c | 4 ++-- drivers/nvme/host/core.c | 4 ++-- drivers/nvme/host/nvme.h | 2 +- drivers/s390/block/dasd.c | 7 ++++--- drivers/scsi/sd.c | 8 ++++---- include/linux/blkdev.h | 2 +- 27 files changed, 59 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index aa287ccdac2f..77704fde9845 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -443,7 +443,7 @@ prototypes:: int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); void (*unlock_native_capacity) (struct gendisk *); - int (*getgeo)(struct block_device *, struct hd_geometry *); + int (*getgeo)(struct gendisk *, struct hd_geometry *); void (*swap_slot_free_notify) (struct block_device *, unsigned long); locking rules: diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 83410f8184ec..94a4fadc651a 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -77,9 +77,9 @@ static void nfhd_submit_bio(struct bio *bio) bio_endio(bio); } -static int nfhd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int nfhd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct nfhd_device *dev = bdev->bd_disk->private_data; + struct nfhd_device *dev = disk->private_data; geo->cylinders = dev->blocks >> (6 - dev->bshift); geo->heads = 4; diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 4de6613e7468..f2b2feeeb455 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -108,7 +108,7 @@ static DEFINE_MUTEX(ubd_lock); static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); -static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo); +static int ubd_getgeo(struct gendisk *disk, struct hd_geometry *geo); #define MAX_DEV (16) @@ -1324,9 +1324,9 @@ static blk_status_t ubd_queue_rq(struct blk_mq_hw_ctx *hctx, return res; } -static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int ubd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct ubd *ubd_dev = bdev->bd_disk->private_data; + struct ubd *ubd_dev = disk->private_data; geo->heads = 128; geo->sectors = 32; diff --git a/block/ioctl.c b/block/ioctl.c index f7b0006ca45d..a14304f737c5 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -481,7 +481,7 @@ static int blkdev_getgeo(struct block_device *bdev, */ memset(&geo, 0, sizeof(geo)); geo.start = get_start_sect(bdev); - ret = disk->fops->getgeo(bdev, &geo); + ret = disk->fops->getgeo(disk, &geo); if (ret) return ret; if (copy_to_user(argp, &geo, sizeof(geo))) @@ -515,7 +515,7 
@@ static int compat_hdio_getgeo(struct block_device *bdev, * want to override it. */ geo.start = get_start_sect(bdev); - ret = disk->fops->getgeo(bdev, &geo); + ret = disk->fops->getgeo(disk, &geo); if (ret) return ret; diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 82d9c4c3fb41..631291fbb356 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -358,7 +358,7 @@ int ibm_partition(struct parsed_partitions *state) goto out_nolab; /* set start if not filled by getgeo function e.g. virtblk */ geo->start = get_start_sect(bdev); - if (disk->fops->getgeo(bdev, geo)) + if (disk->fops->getgeo(disk, geo)) goto out_freeall; if (!fn || fn(disk, info)) { kfree(info); diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 6357d86eafdc..2932b6653b6f 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1523,13 +1523,13 @@ static blk_status_t amiflop_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } -static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int fd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - int drive = MINOR(bdev->bd_dev) & 3; + struct amiga_floppy_struct *p = disk->private_data; - geo->heads = unit[drive].type->heads; - geo->sectors = unit[drive].dtype->sects * unit[drive].type->sect_mult; - geo->cylinders = unit[drive].type->tracks; + geo->heads = p->type->heads; + geo->sectors = p->dtype->sects * p->type->sect_mult; + geo->cylinders = p->type->tracks; return 0; } diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 00b74a845328..34ead75e7e02 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -269,9 +269,9 @@ static blk_status_t aoeblk_queue_rq(struct blk_mq_hw_ctx *hctx, } static int -aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +aoeblk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct aoedev *d = bdev->bd_disk->private_data; + struct aoedev *d = disk->private_data; if ((d->flags & DEVFL_UP) == 0) { printk(KERN_ERR "aoe: disk not up\n"); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 24be0c2c4075..eb125a36ec24 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3363,9 +3363,9 @@ static int get_floppy_geometry(int drive, int type, struct floppy_struct **g) return 0; } -static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int fd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - int drive = (long)bdev->bd_disk->private_data; + int drive = (long)disk->private_data; int type = ITYPE(drive_state[drive].fd_device); struct floppy_struct *g; int ret; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 8fc7761397bd..567192e371a8 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3148,17 +3148,17 @@ static int mtip_block_compat_ioctl(struct block_device *dev, * that each partition is also 4KB aligned. Non-aligned partitions adversely * affects performance. * - * @dev Pointer to the block_device strucutre. + * @disk Pointer to the gendisk strucutre. * @geo Pointer to a hd_geometry structure. * * return value * 0 Operation completed successfully. * -ENOTTY An error occurred while reading the drive capacity. 
*/ -static int mtip_block_getgeo(struct block_device *dev, +static int mtip_block_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct driver_data *dd = dev->bd_disk->private_data; + struct driver_data *dd = disk->private_data; sector_t capacity; if (!dd) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 15627417f12e..119b7e27023f 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -942,11 +942,11 @@ static void rnbd_client_release(struct gendisk *gen) rnbd_clt_put_dev(dev); } -static int rnbd_client_getgeo(struct block_device *block_device, +static int rnbd_client_getgeo(struct gendisk *disk, struct hd_geometry *geo) { u64 size; - struct rnbd_clt_dev *dev = block_device->bd_disk->private_data; + struct rnbd_clt_dev *dev = disk->private_data; struct queue_limits *limit = &dev->queue->limits; size = dev->size * (limit->logical_block_size / SECTOR_SIZE); diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 7af21fe67671..107751721178 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -119,9 +119,8 @@ static inline u32 vdc_tx_dring_avail(struct vio_dring_state *dr) return vio_dring_avail(dr, VDC_TX_RING_SIZE); } -static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int vdc_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct gendisk *disk = bdev->bd_disk; sector_t nsect = get_capacity(disk); sector_t cylinders = nsect; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index eda33c5eb5e2..416015947ae6 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -711,9 +711,9 @@ static int floppy_ioctl(struct block_device *bdev, blk_mode_t mode, return -ENOTTY; } -static int floppy_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int floppy_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct floppy_state *fs = bdev->bd_disk->private_data; + struct floppy_state *fs = disk->private_data; struct floppy_struct *g; int ret; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index e649fa67bac1..85efa7e535f8 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -829,9 +829,9 @@ out: } /* We provide getgeo only to please some old bootloader/partitioning tools */ -static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) +static int virtblk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct virtio_blk *vblk = bd->bd_disk->private_data; + struct virtio_blk *vblk = disk->private_data; int ret = 0; mutex_lock(&vblk->vdev_mutex); @@ -853,7 +853,7 @@ static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) /* some standard values, similar to sd */ geo->heads = 1 << 6; geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bd->bd_disk) >> 11; + geo->cylinders = get_capacity(disk) >> 11; } out: mutex_unlock(&vblk->vdev_mutex); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 5babe575c288..04fc6b552c04 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -493,11 +493,11 @@ static void blkif_restart_queue_callback(void *arg) schedule_work(&rinfo->work); } -static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) +static int blkif_getgeo(struct gendisk *disk, struct hd_geometry *hg) { /* We don't have real geometry info, but let's at least return values consistent with the size of the device */ - sector_t nsect = get_capacity(bd->bd_disk); + sector_t nsect = 
get_capacity(disk); sector_t cylinders = nsect; hg->heads = 0xff; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a44e8c2dccee..7bd6fa05b00a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -403,9 +403,9 @@ static void do_deferred_remove(struct work_struct *w) dm_deferred_remove(); } -static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int dm_blk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mapped_device *md = bdev->bd_disk->private_data; + struct mapped_device *md = disk->private_data; return dm_get_geometry(md, geo); } diff --git a/drivers/md/md.c b/drivers/md/md.c index ac85ec73a409..236bb04f3e54 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7678,9 +7678,9 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev) * 4 sectors (with a BIG number of cylinders...). This drives * dosfs just mad... ;-) */ -static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int md_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mddev *mddev = bdev->bd_disk->private_data; + struct mddev *mddev = disk->private_data; geo->heads = 2; geo->sectors = 4; diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index d34892782f6e..1af157ce0a63 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -1953,10 +1953,10 @@ static void msb_data_clear(struct msb_data *msb) msb->card = NULL; } -static int msb_bd_getgeo(struct block_device *bdev, +static int msb_bd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct msb_data *msb = bdev->bd_disk->private_data; + struct msb_data *msb = disk->private_data; *geo = msb->geometry; return 0; } diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index c9853d887d28..075519caa547 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -189,10 +189,10 @@ static void mspro_block_bd_free_disk(struct gendisk *disk) kfree(msb); } -static int mspro_block_bd_getgeo(struct block_device *bdev, +static int mspro_block_bd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mspro_block_data *msb = bdev->bd_disk->private_data; + struct mspro_block_data *msb = disk->private_data; geo->heads = msb->heads; geo->sectors = msb->sectors_per_track; diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 9cc47bf94804..d1f295af9713 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -435,9 +435,9 @@ static void mmc_blk_release(struct gendisk *disk) } static int -mmc_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +mmc_blk_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - geo->cylinders = get_capacity(bdev->bd_disk) / (4 * 16); + geo->cylinders = get_capacity(disk) / (4 * 16); geo->heads = 4; geo->sectors = 16; return 0; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 847c11542f02..28e09d080440 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -246,9 +246,9 @@ unlock: blktrans_dev_put(dev); } -static int blktrans_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int blktrans_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct mtd_blktrans_dev *dev = bdev->bd_disk->private_data; + struct mtd_blktrans_dev *dev = disk->private_data; int ret = -ENXIO; mutex_lock(&dev->lock); diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 39cc0a6a4d37..b53fd147fa65 100644 --- 
a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -282,12 +282,12 @@ static void ubiblock_release(struct gendisk *gd) mutex_unlock(&dev->dev_mutex); } -static int ubiblock_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int ubiblock_getgeo(struct gendisk *disk, struct hd_geometry *geo) { /* Some tools might require this information */ geo->heads = 1; geo->cylinders = 1; - geo->sectors = get_capacity(bdev->bd_disk); + geo->sectors = get_capacity(disk); geo->start = 0; return 0; } diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 2a1aa32e6693..a933db961ed7 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1478,12 +1478,12 @@ static void btt_submit_bio(struct bio *bio) bio_endio(bio); } -static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo) +static int btt_getgeo(struct gendisk *disk, struct hd_geometry *geo) { /* some standard values */ geo->heads = 1 << 6; geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bd->bd_disk) >> 11; + geo->cylinders = get_capacity(disk) >> 11; return 0; } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 812c1565114f..f96f74bff6ad 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1803,12 +1803,12 @@ static void nvme_release(struct gendisk *disk) nvme_ns_release(disk->private_data); } -int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) +int nvme_getgeo(struct gendisk *disk, struct hd_geometry *geo) { /* some standard values */ geo->heads = 1 << 6; geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bdev->bd_disk) >> 11; + geo->cylinders = get_capacity(disk) >> 11; return 0; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cfd2b5b90b91..102fae6a231c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -936,7 +936,7 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_id_ns **id); -int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); +int nvme_getgeo(struct gendisk *disk, struct hd_geometry *geo); int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); extern const struct attribute_group *nvme_ns_attr_groups[]; diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 506a947d00a5..582ac7e61b24 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3317,11 +3317,11 @@ static void dasd_release(struct gendisk *disk) /* * Return disk geometry. 
*/ -static int dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int dasd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { struct dasd_device *base; - base = dasd_device_from_gendisk(bdev->bd_disk); + base = dasd_device_from_gendisk(disk); if (!base) return -ENODEV; @@ -3331,7 +3331,8 @@ static int dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo) return -EINVAL; } base->discipline->fill_geometry(base->block, geo); - geo->start = get_start_sect(bdev) >> base->block->s2b_shift; + // geo->start is left unchanged by the above + geo->start >>= base->block->s2b_shift; dasd_put_device(base); return 0; } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 3edcc43f180b..00ad574ce61c 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1599,9 +1599,9 @@ static void sd_release(struct gendisk *disk) scsi_device_put(sdev); } -static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int sd_getgeo(struct gendisk *disk, struct hd_geometry *geo) { - struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); + struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; struct Scsi_Host *host = sdp->host; sector_t capacity = logical_to_sectors(sdp, sdkp->capacity); @@ -1614,9 +1614,9 @@ static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) /* override with calculated, extended default, or driver values */ if (host->hostt->bios_param) - host->hostt->bios_param(sdp, bdev->bd_disk, capacity, diskinfo); + host->hostt->bios_param(sdp, disk, capacity, diskinfo); else - scsicam_bios_param(bdev->bd_disk, capacity, diskinfo); + scsicam_bios_param(disk, capacity, diskinfo); geo->heads = diskinfo[0]; geo->sectors = diskinfo[1]; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 95886b404b16..fbc45121cd4f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1659,7 +1659,7 @@ struct block_device_operations { unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); void (*unlock_native_capacity) (struct gendisk *); - int (*getgeo)(struct block_device *, struct hd_geometry *); + int (*getgeo)(struct gendisk *, struct hd_geometry *); int (*set_read_only)(struct block_device *bdev, bool ro); void (*free_disk)(struct gendisk *disk); /* this callback is with swap_lock and sometimes page table lock held */ -- cgit v1.2.3 From 4c70fb2624ab1588faa58dcd407d4c61d64b288d Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Wed, 13 Aug 2025 08:29:01 +0000 Subject: cpuset: remove redundant CS_ONLINE flag The CS_ONLINE flag was introduced prior to the CSS_ONLINE flag in the cpuset subsystem. Currently, the flag setting sequence is as follows: 1. cpuset_css_online() sets CS_ONLINE 2. css->flags gets CSS_ONLINE set ... 3. cgroup->kill_css sets CSS_DYING 4. cpuset_css_offline() clears CS_ONLINE 5. css->flags clears CSS_ONLINE The is_cpuset_online() check currently occurs between steps 1 and 3. However, it would be equally safe to perform this check between steps 2 and 3, as CSS_ONLINE provides the same synchronization guarantee as CS_ONLINE. Since CS_ONLINE is redundant with CSS_ONLINE and provides no additional synchronization benefits, we can safely remove it to simplify the code. 
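To make the equivalence concrete, here is the check before and after this change, taken from the hunks below:

	/* before: cpuset-private CS_ONLINE bit plus the css dying check */
	static inline bool is_cpuset_online(struct cpuset *cs)
	{
		return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
	}

	/* after: both halves of the test read css->flags, which is set in
	 * step 2 of the sequence above
	 */
	static inline bool is_cpuset_online(struct cpuset *cs)
	{
		return css_is_online(&cs->css) && !css_is_dying(&cs->css);
	}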
Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 5 +++++ kernel/cgroup/cpuset-internal.h | 3 +-- kernel/cgroup/cpuset.c | 4 +--- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e..ae73dbb19165 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -354,6 +354,11 @@ static inline bool css_is_dying(struct cgroup_subsys_state *css) return css->flags & CSS_DYING; } +static inline bool css_is_online(struct cgroup_subsys_state *css) +{ + return css->flags & CSS_ONLINE; +} + static inline bool css_is_self(struct cgroup_subsys_state *css) { if (css == &css->cgroup->self) { diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 383963e28ac6..75b3aef39231 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -38,7 +38,6 @@ enum prs_errcode { /* bits in struct cpuset flags field */ typedef enum { - CS_ONLINE, CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, CS_MEM_HARDWALL, @@ -202,7 +201,7 @@ static inline struct cpuset *parent_cs(struct cpuset *cs) /* convenient tests for these bits */ static inline bool is_cpuset_online(struct cpuset *cs) { - return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); + return css_is_online(&cs->css) && !css_is_dying(&cs->css); } static inline int is_cpu_exclusive(const struct cpuset *cs) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 27adb04df675..3466ebbf1016 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -207,7 +207,7 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) * parallel, we may leave an offline CPU in cpu_allowed or some other masks. */ static struct cpuset top_cpuset = { - .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | + .flags = BIT(CS_CPU_EXCLUSIVE) | BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, .relax_domain_level = -1, @@ -3496,7 +3496,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpus_read_lock(); mutex_lock(&cpuset_mutex); - set_bit(CS_ONLINE, &cs->flags); if (is_spread_page(parent)) set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) @@ -3571,7 +3570,6 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); cpuset_dec(); - clear_bit(CS_ONLINE, &cs->flags); mutex_unlock(&cpuset_mutex); cpus_read_unlock(); -- cgit v1.2.3 From 2caa6b88e0ba0231fb4ff0ba8e73cedd5fb81fc8 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 11 Aug 2025 14:08:04 +0200 Subject: bpf: Don't use %pK through printk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the past %pK was preferable to %p as it would not leak raw pointer values into the kernel log. Since commit ad67b74d2469 ("printk: hash addresses printed with %p") the regular %p has been improved to avoid this issue. Furthermore, restricted pointers ("%pK") were never meant to be used through printk(). They can still unintentionally leak raw pointers or acquire sleeping locks in atomic contexts. Switch to the regular pointer formatting which is safer and easier to reason about. 
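As a minimal sketch of the behavior relied on here (the helper name is illustrative; the format specifiers are not):

	static inline void dump_image_ptr(void *image)
	{
		/* since commit ad67b74d2469, %p prints a per-boot hashed value
		 * (e.g. "000000006acbf798"), so pointers can still be correlated
		 * across log lines without exposing the raw kernel address
		 */
		pr_err("image=%p\n", image);

		/* %px would print the unmodified address; %pK output depends on
		 * kptr_restrict and can acquire sleeping locks in atomic
		 * contexts, hence the switch
		 */
	}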
Signed-off-by: Thomas Weißschuh Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250811-restricted-pointers-bpf-v1-1-a1d7cc3cb9e7@linutronix.de --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1e7fd3ee759e..52fecb7a1fe3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1296,7 +1296,7 @@ void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, u32 pass, void *image) { - pr_err("flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n", flen, + pr_err("flen=%u proglen=%u pass=%u image=%p from=%s pid=%d\n", flen, proglen, pass, image, current->comm, task_pid_nr(current)); if (image) -- cgit v1.2.3 From 139235103f6039c2c77cb8f51cb2e7e610fe0114 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Mon, 11 Aug 2025 15:35:05 +0800 Subject: net: stmmac: Change first parameter of fix_soc_reset() In order to use netdev_err() to print messages in the callback function of fix_soc_reset(), change fix_soc_reset() to have "struct stmmac_priv *" as its first parameter. This is preparation for a later patch; no functional change. Suggested-by: Andrew Lunn Signed-off-by: Tiezhu Yang Link: https://patch.msgid.link/20250811073506.27513-3-yangtiezhu@loongson.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c | 6 +++--- drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 2 +- drivers/net/ethernet/stmicro/stmmac/hwif.c | 2 +- include/linux/stmmac.h | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c index 889e2bb6f7f5..c2d9e89f0063 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -49,7 +49,7 @@ struct imx_dwmac_ops { u32 flags; bool mac_rgmii_txclk_auto_adj; - int (*fix_soc_reset)(void *priv, void __iomem *ioaddr); + int (*fix_soc_reset)(struct stmmac_priv *priv, void __iomem *ioaddr); int (*set_intf_mode)(struct plat_stmmacenet_data *plat_dat); void (*fix_mac_speed)(void *priv, int speed, unsigned int mode); }; @@ -265,9 +265,9 @@ static void imx93_dwmac_fix_speed(void *priv, int speed, unsigned int mode) writel(old_ctrl, dwmac->base_addr + MAC_CTRL_REG); } -static int imx_dwmac_mx93_reset(void *priv, void __iomem *ioaddr) +static int imx_dwmac_mx93_reset(struct stmmac_priv *priv, void __iomem *ioaddr) { - struct plat_stmmacenet_data *plat_dat = priv; + struct plat_stmmacenet_data *plat_dat = priv->plat; u32 value = readl(ioaddr + DMA_BUS_MODE); /* DMA SW reset */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index 5769165ee5ba..4c477d6ce1e6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -509,7 +509,7 @@ static int loongson_dwmac_acpi_config(struct pci_dev *pdev, } /* Loongson's DWMAC device may take nearly two seconds to complete DMA reset */ -static int loongson_dwmac_fix_reset(void *priv, void __iomem *ioaddr) +static int loongson_dwmac_fix_reset(struct stmmac_priv *priv, void __iomem *ioaddr) { u32 value = readl(ioaddr + DMA_BUS_MODE); diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.c b/drivers/net/ethernet/stmicro/stmmac/hwif.c index
99635b37044a..3f7c765dcb79 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.c +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.c @@ -100,7 +100,7 @@ int stmmac_reset(struct stmmac_priv *priv, void __iomem *ioaddr) return -EINVAL; if (plat && plat->fix_soc_reset) - return plat->fix_soc_reset(plat, ioaddr); + return plat->fix_soc_reset(priv, ioaddr); return stmmac_do_callback(priv, dma, reset, ioaddr); } diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 22c24dacbc65..e284f04964bf 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -238,7 +238,7 @@ struct plat_stmmacenet_data { int (*set_clk_tx_rate)(void *priv, struct clk *clk_tx_i, phy_interface_t interface, int speed); void (*fix_mac_speed)(void *priv, int speed, unsigned int mode); - int (*fix_soc_reset)(void *priv, void __iomem *ioaddr); + int (*fix_soc_reset)(struct stmmac_priv *priv, void __iomem *ioaddr); int (*serdes_powerup)(struct net_device *ndev, void *priv); void (*serdes_powerdown)(struct net_device *ndev, void *priv); int (*mac_finish)(struct net_device *ndev, -- cgit v1.2.3 From 96326447d466ca62b713f660cfc73ef7879151a0 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 12 Aug 2025 06:57:23 +0200 Subject: net: mediatek: wed: Introduce MT7992 WED support to MT7988 SoC Introduce the second WDMA RX ring in the WED driver for the MT7988 SoC, since the Mediatek MT7992 WiFi chipset supports two separate WDMA rings. Add missing MT7988 configurations to properly support WED for MT7992 in the MT76 driver. Co-developed-by: Rex Lu Signed-off-by: Rex Lu Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250812-mt7992-wed-support-v3-1-9ada78a819a4@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_wed.c | 33 +++++++++++++++++++----- drivers/net/ethernet/mediatek/mtk_wed.h | 2 +- drivers/net/wireless/mediatek/mt76/mt7915/mmio.c | 6 ++--- drivers/net/wireless/mediatek/mt76/mt7996/mmio.c | 12 ++++----- include/linux/soc/mediatek/mtk_wed.h | 2 +- 5 files changed, 38 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c index 0a80d8f8cff7..3dbb113b792c 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.c +++ b/drivers/net/ethernet/mediatek/mtk_wed.c @@ -59,7 +59,9 @@ struct mtk_wed_flow_block_priv { static const struct mtk_wed_soc_data mt7622_data = { .regmap = { .tx_bm_tkid = 0x088, - .wpdma_rx_ring0 = 0x770, + .wpdma_rx_ring = { + 0x770, + }, .reset_idx_tx_mask = GENMASK(3, 0), .reset_idx_rx_mask = GENMASK(17, 16), }, @@ -70,7 +72,9 @@ static const struct mtk_wed_soc_data mt7622_data = { static const struct mtk_wed_soc_data mt7986_data = { .regmap = { .tx_bm_tkid = 0x0c8, - .wpdma_rx_ring0 = 0x770, + .wpdma_rx_ring = { + 0x770, + }, .reset_idx_tx_mask = GENMASK(1, 0), .reset_idx_rx_mask = GENMASK(7, 6), }, @@ -81,7 +85,10 @@ static const struct mtk_wed_soc_data mt7986_data = { static const struct mtk_wed_soc_data mt7988_data = { .regmap = { .tx_bm_tkid = 0x0c8, - .wpdma_rx_ring0 = 0x7d0, + .wpdma_rx_ring = { + 0x7d0, + 0x7d8, + }, .reset_idx_tx_mask = GENMASK(1, 0), .reset_idx_rx_mask = GENMASK(7, 6), }, @@ -621,8 +628,8 @@ mtk_wed_amsdu_init(struct mtk_wed_device *dev) return ret; } - /* eagle E1 PCIE1 tx ring 22 flow control issue */ - if (dev->wlan.id == 0x7991) + /* Kite and Eagle E1 PCIE1 tx ring 22 flow control issue */ + if (dev->wlan.id == 0x7991 || dev->wlan.id == 0x7992) wed_clr(dev, MTK_WED_AMSDU_FIFO, MTK_WED_AMSDU_IS_PRIOR0_RING); wed_set(dev, MTK_WED_CTRL,
MTK_WED_CTRL_TX_AMSDU_EN); @@ -1239,7 +1246,11 @@ mtk_wed_set_wpdma(struct mtk_wed_device *dev) return; wed_w32(dev, MTK_WED_WPDMA_RX_GLO_CFG, dev->wlan.wpdma_rx_glo); - wed_w32(dev, dev->hw->soc->regmap.wpdma_rx_ring0, dev->wlan.wpdma_rx); + wed_w32(dev, dev->hw->soc->regmap.wpdma_rx_ring[0], + dev->wlan.wpdma_rx[0]); + if (mtk_wed_is_v3_or_greater(dev->hw)) + wed_w32(dev, dev->hw->soc->regmap.wpdma_rx_ring[1], + dev->wlan.wpdma_rx[1]); if (!dev->wlan.hw_rro) return; @@ -2323,6 +2334,16 @@ mtk_wed_start(struct mtk_wed_device *dev, u32 irq_mask) if (!dev->rx_wdma[i].desc) mtk_wed_wdma_rx_ring_setup(dev, i, 16, false); + if (dev->wlan.hw_rro) { + for (i = 0; i < MTK_WED_RX_PAGE_QUEUES; i++) { + u32 addr = MTK_WED_RRO_MSDU_PG_CTRL0(i) + + MTK_WED_RING_OFS_COUNT; + + if (!wed_r32(dev, addr)) + wed_w32(dev, addr, 1); + } + } + mtk_wed_hw_init(dev); mtk_wed_configure_irq(dev, irq_mask); diff --git a/drivers/net/ethernet/mediatek/mtk_wed.h b/drivers/net/ethernet/mediatek/mtk_wed.h index c1f0479d7a71..b49aee9a8b65 100644 --- a/drivers/net/ethernet/mediatek/mtk_wed.h +++ b/drivers/net/ethernet/mediatek/mtk_wed.h @@ -17,7 +17,7 @@ struct mtk_wed_wo; struct mtk_wed_soc_data { struct { u32 tx_bm_tkid; - u32 wpdma_rx_ring0; + u32 wpdma_rx_ring[MTK_WED_RX_QUEUES]; u32 reset_idx_tx_mask; u32 reset_idx_rx_mask; } regmap; diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c index 4a82f8e4c118..36488aa6cc20 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mmio.c @@ -664,8 +664,8 @@ int mt7915_mmio_wed_init(struct mt7915_dev *dev, void *pdev_ptr, MT_RXQ_WED_RING_BASE; wed->wlan.wpdma_rx_glo = pci_resource_start(pci_dev, 0) + MT_WPDMA_GLO_CFG; - wed->wlan.wpdma_rx = pci_resource_start(pci_dev, 0) + - MT_RXQ_WED_DATA_RING_BASE; + wed->wlan.wpdma_rx[0] = pci_resource_start(pci_dev, 0) + + MT_RXQ_WED_DATA_RING_BASE; } else { struct platform_device *plat_dev = pdev_ptr; struct resource *res; @@ -687,7 +687,7 @@ int mt7915_mmio_wed_init(struct mt7915_dev *dev, void *pdev_ptr, wed->wlan.wpdma_tx = res->start + MT_TXQ_WED_RING_BASE; wed->wlan.wpdma_txfree = res->start + MT_RXQ_WED_RING_BASE; wed->wlan.wpdma_rx_glo = res->start + MT_WPDMA_GLO_CFG; - wed->wlan.wpdma_rx = res->start + MT_RXQ_WED_DATA_RING_BASE; + wed->wlan.wpdma_rx[0] = res->start + MT_RXQ_WED_DATA_RING_BASE; } wed->wlan.nbuf = MT7915_HW_TOKEN_SIZE; wed->wlan.tx_tbit[0] = is_mt7915(&dev->mt76) ? 
4 : 30; diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mmio.c b/drivers/net/wireless/mediatek/mt76/mt7996/mmio.c index 30b40f4a91be..fb2428a9b877 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7996/mmio.c +++ b/drivers/net/wireless/mediatek/mt76/mt7996/mmio.c @@ -503,9 +503,9 @@ int mt7996_mmio_wed_init(struct mt7996_dev *dev, void *pdev_ptr, } wed->wlan.wpdma_rx_glo = wed->wlan.phy_base + hif1_ofs + MT_WFDMA0_GLO_CFG; - wed->wlan.wpdma_rx = wed->wlan.phy_base + hif1_ofs + - MT_RXQ_RING_BASE(MT7996_RXQ_BAND0) + - MT7996_RXQ_BAND0 * MT_RING_SIZE; + wed->wlan.wpdma_rx[0] = wed->wlan.phy_base + hif1_ofs + + MT_RXQ_RING_BASE(MT7996_RXQ_BAND0) + + MT7996_RXQ_BAND0 * MT_RING_SIZE; wed->wlan.id = MT7996_DEVICE_ID_2; wed->wlan.tx_tbit[0] = ffs(MT_INT_TX_DONE_BAND2) - 1; @@ -518,9 +518,9 @@ int mt7996_mmio_wed_init(struct mt7996_dev *dev, void *pdev_ptr, wed->wlan.wpdma_rx_glo = wed->wlan.phy_base + MT_WFDMA0_GLO_CFG; - wed->wlan.wpdma_rx = wed->wlan.phy_base + - MT_RXQ_RING_BASE(MT7996_RXQ_BAND0) + - MT7996_RXQ_BAND0 * MT_RING_SIZE; + wed->wlan.wpdma_rx[0] = wed->wlan.phy_base + + MT_RXQ_RING_BASE(MT7996_RXQ_BAND0) + + MT7996_RXQ_BAND0 * MT_RING_SIZE; wed->wlan.wpdma_rx_rro[0] = wed->wlan.phy_base + MT_RXQ_RING_BASE(MT7996_RXQ_RRO_BAND0) + diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index d8949a4ed0dc..c4ff6bab176d 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -147,7 +147,7 @@ struct mtk_wed_device { u32 wpdma_tx; u32 wpdma_txfree; u32 wpdma_rx_glo; - u32 wpdma_rx; + u32 wpdma_rx[MTK_WED_RX_QUEUES]; u32 wpdma_rx_rro[MTK_WED_RX_QUEUES]; u32 wpdma_rx_pg; -- cgit v1.2.3 From fb2f2a86f0cd9690357b9bb67af00d386a7e819f Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Mon, 16 Jun 2025 13:03:21 +0200 Subject: ice: cleanup capabilities evaluation When evaluating the capabilities field, the ICE_AQC_BIT_ROCEV2_LAG and ICE_AQC_BIT_SRIOV_LAG defines were both not using the BIT operator, instead simply setting a hex value that sets the correct bits. While not inaccurate, this method is misleading, and when it is expanded in the following implementation it becomes even more confusing. Switch to using the BIT() operator to clarify what is being checked. Reviewed-by: Przemek Kitszel Reviewed-by: Aleksandr Loktionov Reviewed-by: Marcin Szycik Signed-off-by: Dave Ertman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- include/linux/net/intel/libie/adminq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net/intel/libie/adminq.h b/include/linux/net/intel/libie/adminq.h index 012b5d499c1a..dbe93f940ef0 100644 --- a/include/linux/net/intel/libie/adminq.h +++ b/include/linux/net/intel/libie/adminq.h @@ -192,8 +192,8 @@ LIBIE_CHECK_STRUCT_LEN(16, libie_aqc_list_caps); #define LIBIE_AQC_CAPS_TX_SCHED_TOPO_COMP_MODE 0x0085 #define LIBIE_AQC_CAPS_NAC_TOPOLOGY 0x0087 #define LIBIE_AQC_CAPS_FW_LAG_SUPPORT 0x0092 -#define LIBIE_AQC_BIT_ROCEV2_LAG 0x01 -#define LIBIE_AQC_BIT_SRIOV_LAG 0x02 +#define LIBIE_AQC_BIT_ROCEV2_LAG BIT(0) +#define LIBIE_AQC_BIT_SRIOV_LAG BIT(1) #define LIBIE_AQC_CAPS_FLEX10 0x00F1 #define LIBIE_AQC_CAPS_CEM 0x00F2 -- cgit v1.2.3 From 8d90041a0d285044b89629f539ca0685e156848b Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 14 Aug 2025 11:22:13 -0400 Subject: dlm: handle release_option as unsigned Future patches will introduce an invalid argument check for undefined values.
All release_option values are positive integers, so to avoid also having to check for negative values, change the parameter to unsigned int. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 8 ++++---- include/linux/dlm.h | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index d986b7ef153d..8eadbf0fdf17 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -676,7 +676,7 @@ int dlm_new_user_lockspace(const char *name, const char *cluster, This is because there may be LKBs queued as ASTs that have been unlinked from their RSBs and are pending deletion once the AST has been delivered */ -static int lockspace_busy(struct dlm_ls *ls, int release_option) +static int lockspace_busy(struct dlm_ls *ls, unsigned int release_option) { struct dlm_lkb *lkb; unsigned long id; @@ -704,7 +704,7 @@ static int lockspace_busy(struct dlm_ls *ls, int release_option) return rv; } -static int release_lockspace(struct dlm_ls *ls, int release_option) +static int release_lockspace(struct dlm_ls *ls, unsigned int release_option) { int busy, rv; @@ -792,7 +792,7 @@ static int release_lockspace(struct dlm_ls *ls, int release_option) * See DLM_RELEASE defines for release_option values and their meaning. */ -int dlm_release_lockspace(void *lockspace, int force) +int dlm_release_lockspace(void *lockspace, unsigned int release_option) { struct dlm_ls *ls; int error; @@ -803,7 +803,7 @@ int dlm_release_lockspace(void *lockspace, int force) dlm_put_lockspace(ls); mutex_lock(&ls_lock); - error = release_lockspace(ls, force); + error = release_lockspace(ls, release_option); if (!error) ls_count--; if (!ls_count) diff --git a/include/linux/dlm.h b/include/linux/dlm.h index 108eb953eb18..34015a008b80 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -122,7 +122,8 @@ int dlm_new_lockspace(const char *name, const char *cluster, * release_option: see DLM_RELEASE values above. */ -int dlm_release_lockspace(dlm_lockspace_t *lockspace, int release_option); +int dlm_release_lockspace(dlm_lockspace_t *lockspace, + unsigned int release_option); /* * dlm_lock -- cgit v1.2.3 From 8e40210788636619404871df07445fa4590138b4 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 14 Aug 2025 11:22:14 -0400 Subject: dlm: check for undefined release_option values Check for all undefined release_option values and return -EINVAL in case a user provides one to dlm_release_lockspace().
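As a rough illustration of the effect (the caller below is hypothetical and not part of the patch; only dlm_release_lockspace() and the DLM_RELEASE defines come from this series), the new bound check rejects any option above __DLM_RELEASE_MAX before the lockspace is even looked up:

#include <linux/dlm.h>

/* Hypothetical caller sketch: assumes "ls" is a valid lockspace handle. */
static int example_release(dlm_lockspace_t *ls)
{
	int error;

	/* Undefined option: 5 > __DLM_RELEASE_MAX (== DLM_RELEASE_RECOVER == 4),
	 * so this now fails fast with -EINVAL instead of being accepted.
	 */
	error = dlm_release_lockspace(ls, 5);
	WARN_ON(error != -EINVAL);

	/* Defined option: within range, processed as before. */
	return dlm_release_lockspace(ls, DLM_RELEASE_NORMAL);
}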
Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 3 +++ include/linux/dlm.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 8eadbf0fdf17..ddaa76558706 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -797,6 +797,9 @@ int dlm_release_lockspace(void *lockspace, unsigned int release_option) struct dlm_ls *ls; int error; + if (release_option > __DLM_RELEASE_MAX) + return -EINVAL; + ls = dlm_find_lockspace_local(lockspace); if (!ls) return -EINVAL; diff --git a/include/linux/dlm.h b/include/linux/dlm.h index 34015a008b80..7e7b45b0d097 100644 --- a/include/linux/dlm.h +++ b/include/linux/dlm.h @@ -113,6 +113,7 @@ int dlm_new_lockspace(const char *name, const char *cluster, #define DLM_RELEASE_NORMAL 2 #define DLM_RELEASE_NO_EVENT 3 #define DLM_RELEASE_RECOVER 4 +#define __DLM_RELEASE_MAX DLM_RELEASE_RECOVER /* * dlm_release_lockspace -- cgit v1.2.3 From dab32f2576a39d5f54f3dbbbc718d92fa5109ce9 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Thu, 7 Aug 2025 15:55:39 +0200 Subject: s390/pci: Use pci_uevent_ers() in PCI recovery Issue uevents on s390 during PCI recovery using pci_uevent_ers() as done by EEH and AER PCIe recovery routines. Signed-off-by: Niklas Schnelle Signed-off-by: Bjorn Helgaas Reviewed-by: Lukas Wunner Link: https://patch.msgid.link/20250807-add_err_uevents-v5-2-adf85b0620b0@linux.ibm.com --- arch/s390/pci/pci_event.c | 3 +++ drivers/pci/pci-driver.c | 2 +- include/linux/pci.h | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index d930416d4c90..b95376041501 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -88,6 +88,7 @@ static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev, pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; ers_res = driver->err_handler->error_detected(pdev, pdev->error_state); + pci_uevent_ers(pdev, ers_res); if (ers_result_indicates_abort(ers_res)) pr_info("%s: Automatic recovery failed after initial reporting\n", pci_name(pdev)); else if (ers_res == PCI_ERS_RESULT_NEED_RESET) @@ -244,6 +245,7 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) ers_res = PCI_ERS_RESULT_RECOVERED; if (ers_res != PCI_ERS_RESULT_RECOVERED) { + pci_uevent_ers(pdev, PCI_ERS_RESULT_DISCONNECT); pr_err("%s: Automatic recovery failed; operator intervention is required\n", pci_name(pdev)); status_str = "failed (driver can't recover)"; @@ -253,6 +255,7 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) pr_info("%s: The device is ready to resume operations\n", pci_name(pdev)); if (driver->err_handler->resume) driver->err_handler->resume(pdev); + pci_uevent_ers(pdev, PCI_ERS_RESULT_RECOVERED); out_unlock: pci_dev_unlock(pdev); zpci_report_status(zdev, "recovery", status_str); diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 6405acdb5d0f..302d61783f6c 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -1582,7 +1582,7 @@ static int pci_uevent(const struct device *dev, struct kobj_uevent_env *env) return 0; } -#if defined(CONFIG_PCIEAER) || defined(CONFIG_EEH) +#if defined(CONFIG_PCIEAER) || defined(CONFIG_EEH) || defined(CONFIG_S390) /** * pci_uevent_ers - emit a uevent during recovery path of PCI device * @pdev: PCI device undergoing error recovery diff --git a/include/linux/pci.h b/include/linux/pci.h 
index 59876de13860..7735acf6f349 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2764,7 +2764,7 @@ static inline bool pci_is_thunderbolt_attached(struct pci_dev *pdev) return false; } -#if defined(CONFIG_PCIEPORTBUS) || defined(CONFIG_EEH) +#if defined(CONFIG_PCIEPORTBUS) || defined(CONFIG_EEH) || defined(CONFIG_S390) void pci_uevent_ers(struct pci_dev *pdev, enum pci_ers_result err_type); #endif -- cgit v1.2.3 From f39494089aaa1022008eee245fb83ef1ae911b6d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Jul 2025 21:09:13 -0700 Subject: srcu: Move rcu_is_watching() checks to srcu_read_{,un}lock_fast() The rcu_is_watching() warnings are currently in the SRCU-tree implementations of __srcu_read_lock_fast() and __srcu_read_unlock_fast(). However, this makes it difficult to create _notrace variants of srcu_read_lock_fast() and srcu_read_unlock_fast(). This commit therefore moves these checks to srcu_read_lock_fast(), srcu_read_unlock_fast(), srcu_down_read_fast(), and srcu_up_read_fast(). Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: --- include/linux/srcu.h | 4 ++++ include/linux/srcutree.h | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index f179700fecaf..478c73d067f7 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -275,6 +275,7 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct * { struct srcu_ctr __percpu *retval; + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast()."); srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); retval = __srcu_read_lock_fast(ssp); rcu_try_lock_acquire(&ssp->dep_map); @@ -295,6 +296,7 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct * static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *ssp) __acquires(ssp) { WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_down_read_fast()."); srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); return __srcu_read_lock_fast(ssp); } @@ -389,6 +391,7 @@ static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ct srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); srcu_lock_release(&ssp->dep_map); __srcu_read_unlock_fast(ssp, scp); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast()."); } /** @@ -405,6 +408,7 @@ static inline void srcu_up_read_fast(struct srcu_struct *ssp, struct srcu_ctr __ WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); __srcu_read_unlock_fast(ssp, scp); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_up_read_fast()."); } /** diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index bf44d8d1e69e..043b5a67ef71 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -244,7 +244,6 @@ static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct { struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); - RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast()."); if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) this_cpu_inc(scp->srcu_locks.counter); /* Y */ else @@ -275,7 +274,6 @@ static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ 
this_cpu_inc(scp->srcu_unlocks.counter); /* Z */ else atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks)); /* Z */ - RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast()."); } void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); -- cgit v1.2.3 From 7e2a2d060da4860af37e1000dc62a30a1551d9e8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Jul 2025 09:12:16 -0700 Subject: srcu: Add srcu_read_lock_fast_notrace() and srcu_read_unlock_fast_notrace() This commit adds no-trace variants of the srcu_read_lock_fast() and srcu_read_unlock_fast() functions for tracing use. [ paulmck: Apply notrace feedback from Joel Fernandes, Steven Rostedt, and Mathieu Desnoyers. ] [ paulmck: Apply excess-notrace feedback from Boqun Feng. ] Link: https://lore.kernel.org/all/20250721162433.10454-1-paulmck@kernel.org Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: --- include/linux/srcu.h | 25 +++++++++++++++++++++++++ include/linux/srcutree.h | 5 +++-- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 478c73d067f7..7a692bf8f99b 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -282,6 +282,20 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct * return retval; } +/* + * Used by tracing, cannot be traced and cannot call lockdep. + * See srcu_read_lock_fast() for more information. + */ +static inline struct srcu_ctr __percpu *srcu_read_lock_fast_notrace(struct srcu_struct *ssp) + __acquires(ssp) +{ + struct srcu_ctr __percpu *retval; + + srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + retval = __srcu_read_lock_fast(ssp); + return retval; +} + /** * srcu_down_read_fast - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. @@ -394,6 +408,17 @@ static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ct RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast()."); } +/* + * Used by tracing, cannot be traced and cannot call lockdep. + * See srcu_read_unlock_fast() for more information. + */ +static inline void srcu_read_unlock_fast_notrace(struct srcu_struct *ssp, + struct srcu_ctr __percpu *scp) __releases(ssp) +{ + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); + __srcu_read_unlock_fast(ssp, scp); +} + /** * srcu_up_read_fast - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 043b5a67ef71..4d2fee4d3828 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -240,7 +240,7 @@ static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ss * on architectures that support NMIs but do not supply NMI-safe * implementations of this_cpu_inc(). */ -static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct *ssp) +static inline struct srcu_ctr __percpu notrace *__srcu_read_lock_fast(struct srcu_struct *ssp) { struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); @@ -267,7 +267,8 @@ static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct * on architectures that support NMIs but do not supply NMI-safe * implementations of this_cpu_inc(). 
*/ -static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) +static inline void notrace +__srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) { barrier(); /* Avoid leaking the critical section. */ if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) -- cgit v1.2.3 From cacadb630375b8c30ca4d0300812178bb884c0b0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Jul 2025 09:19:39 -0700 Subject: srcu: Add guards for notrace variants of SRCU-fast readers This adds the usual scoped_guard(srcu_fast_notrace, &my_srcu) and guard(srcu_fast_notrace)(&my_srcu). Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: --- include/linux/srcu.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 7a692bf8f99b..ada65b58bc4c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -515,4 +515,9 @@ DEFINE_LOCK_GUARD_1(srcu_fast, struct srcu_struct, srcu_read_unlock_fast(_T->lock, _T->scp), struct srcu_ctr __percpu *scp) +DEFINE_LOCK_GUARD_1(srcu_fast_notrace, struct srcu_struct, + _T->scp = srcu_read_lock_fast_notrace(_T->lock), + srcu_read_unlock_fast_notrace(_T->lock, _T->scp), + struct srcu_ctr __percpu *scp) + #endif -- cgit v1.2.3 From 28f073b38372b99d8d33ff5e63897d28419bda20 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Mon, 16 Jun 2025 13:03:23 +0200 Subject: ice: Implement support for SRIOV VFs across Active/Active bonds This patch implements the software flows to handle SRIOV VF communication across an Active/Active link aggregate. The same restrictions apply as are in place for the support of Active/Backup bonds. - the two interfaces must be on the same NIC - the FW LLDP engine needs to be disabled - the DDP package that supports VF LAG must be loaded on device - the two interfaces must have the same QoS config - only the first interface added to the bond will have VF support - the interface with VFs must be in switchdev mode With the additional requirement of - the version of the FW on the NIC needs to have VF Active/Active support This requirement is indicated in the capabilities struct associated with the NVM loaded on the NIC. The balancing of traffic between the two interfaces is done on a queue basis. Taking the queues allocated to all of the VFs as a whole, one half of them will be distributed to each interface. When a link goes down, then the queues allocated to the down interface will migrate to the active port. When the down port comes back up, then the same queues as were originally assigned there will be moved back. 
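To make the queue-balancing rule concrete, here is a standalone sketch of the split (the names here are hypothetical; in the patch itself this logic lives in ice_lag_aa_move_vf_qs() and ice_lag_aa_failover(), driven by an odd/even toggle walked across the VF queues):

#include <stdbool.h>

enum { PORT_PRIMARY = 0, PORT_SECONDARY = 1 };

/* One odd/even toggle walked across all VF queues sends alternate queues
 * to each interface; "all" pulls every queue to "dest" when only one port
 * is carrying traffic.
 */
static void split_vf_queues(int q_home[], int num_q, int dest, bool all)
{
	bool odd = true;

	for (int i = 0; i < num_q; i++) {
		odd = !odd;
		/* In an even split, skip queues that belong to the other port. */
		if (!all && ((dest == PORT_PRIMARY && odd) ||
			     (dest == PORT_SECONDARY && !odd)))
			continue;
		q_home[i] = dest; /* queue i now lives on port "dest" */
	}
}

When both ports are up, half the queues land on each interface; when a link drops, a pass with "all" set migrates the remainder to the surviving port, mirroring the failover behaviour described above.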
Co-developed-by: Marcin Szycik Signed-off-by: Marcin Szycik Signed-off-by: Dave Ertman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 1 + drivers/net/ethernet/intel/ice/ice_adminq_cmd.h | 4 + drivers/net/ethernet/intel/ice/ice_common.c | 15 +- drivers/net/ethernet/intel/ice/ice_common.h | 2 +- drivers/net/ethernet/intel/ice/ice_lag.c | 772 +++++++++++++++++++++--- drivers/net/ethernet/intel/ice/ice_lag.h | 21 +- drivers/net/ethernet/intel/ice/ice_type.h | 1 + include/linux/net/intel/libie/adminq.h | 1 + 8 files changed, 713 insertions(+), 104 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 2098f00b3cd3..a83fdd22cbe4 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -203,6 +203,7 @@ enum ice_feature { ICE_F_GCS, ICE_F_ROCE_LAG, ICE_F_SRIOV_LAG, + ICE_F_SRIOV_AA_LAG, ICE_F_MBX_LIMIT, ICE_F_MAX }; diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 3bd3ea3af888..caae1780fd37 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -2060,6 +2060,10 @@ struct ice_aqc_cfg_txqs { #define ICE_AQC_Q_CFG_SRC_PRT_M 0x7 #define ICE_AQC_Q_CFG_DST_PRT_S 3 #define ICE_AQC_Q_CFG_DST_PRT_M (0x7 << ICE_AQC_Q_CFG_DST_PRT_S) +#define ICE_AQC_Q_CFG_MODE_M GENMASK(7, 6) +#define ICE_AQC_Q_CFG_MODE_SAME_PF 0x0 +#define ICE_AQC_Q_CFG_MODE_GIVE_OWN 0x1 +#define ICE_AQC_Q_CFG_MODE_KEEP_OWN 0x2 u8 time_out; #define ICE_AQC_Q_CFG_TIMEOUT_S 2 #define ICE_AQC_Q_CFG_TIMEOUT_M (0x1F << ICE_AQC_Q_CFG_TIMEOUT_S) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 209f42045fea..808870539667 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -2424,6 +2424,9 @@ ice_parse_common_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps, caps->sriov_lag = number & LIBIE_AQC_BIT_SRIOV_LAG; ice_debug(hw, ICE_DBG_INIT, "%s: sriov_lag = %u\n", prefix, caps->sriov_lag); + caps->sriov_aa_lag = number & LIBIE_AQC_BIT_SRIOV_AA_LAG; + ice_debug(hw, ICE_DBG_INIT, "%s: sriov_aa_lag = %u\n", + prefix, caps->sriov_aa_lag); break; case LIBIE_AQC_CAPS_TX_SCHED_TOPO_COMP_MODE: caps->tx_sched_topo_comp_mode_en = (number == 1); @@ -4712,24 +4715,24 @@ do_aq: } /** - * ice_aq_cfg_lan_txq + * ice_aq_cfg_lan_txq - send AQ command 0x0C32 to FW * @hw: pointer to the hardware structure * @buf: buffer for command * @buf_size: size of buffer in bytes * @num_qs: number of queues being configured * @oldport: origination lport * @newport: destination lport + * @mode: cmd_type for move to use * @cd: pointer to command details structure or NULL * * Move/Configure LAN Tx queue (0x0C32) * - * There is a better AQ command to use for moving nodes, so only coding - * this one for configuring the node. + * Return: Zero on success, associated error code on failure. 
*/ int ice_aq_cfg_lan_txq(struct ice_hw *hw, struct ice_aqc_cfg_txqs_buf *buf, u16 buf_size, u16 num_qs, u8 oldport, u8 newport, - struct ice_sq_cd *cd) + u8 mode, struct ice_sq_cd *cd) { struct ice_aqc_cfg_txqs *cmd; struct libie_aq_desc desc; @@ -4742,10 +4745,12 @@ ice_aq_cfg_lan_txq(struct ice_hw *hw, struct ice_aqc_cfg_txqs_buf *buf, if (!buf) return -EINVAL; - cmd->cmd_type = ICE_AQC_Q_CFG_TC_CHNG; + cmd->cmd_type = mode; cmd->num_qs = num_qs; cmd->port_num_chng = (oldport & ICE_AQC_Q_CFG_SRC_PRT_M); cmd->port_num_chng |= FIELD_PREP(ICE_AQC_Q_CFG_DST_PRT_M, newport); + cmd->port_num_chng |= FIELD_PREP(ICE_AQC_Q_CFG_MODE_M, + ICE_AQC_Q_CFG_MODE_KEEP_OWN); cmd->time_out = FIELD_PREP(ICE_AQC_Q_CFG_TIMEOUT_M, 5); cmd->blocked_cgds = 0; diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h index 60320cdf7804..dba15ad315a6 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.h +++ b/drivers/net/ethernet/intel/ice/ice_common.h @@ -270,7 +270,7 @@ ice_ena_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 q_handle, int ice_aq_cfg_lan_txq(struct ice_hw *hw, struct ice_aqc_cfg_txqs_buf *buf, u16 buf_size, u16 num_qs, u8 oldport, u8 newport, - struct ice_sq_cd *cd); + u8 mode, struct ice_sq_cd *cd); int ice_replay_vsi(struct ice_hw *hw, u16 vsi_handle); void ice_replay_post(struct ice_hw *hw); struct ice_q_ctx * diff --git a/drivers/net/ethernet/intel/ice/ice_lag.c b/drivers/net/ethernet/intel/ice/ice_lag.c index 72284555b98a..80312e1dcf7f 100644 --- a/drivers/net/ethernet/intel/ice/ice_lag.c +++ b/drivers/net/ethernet/intel/ice/ice_lag.c @@ -14,6 +14,9 @@ static const u8 lacp_train_pkt[ICE_TRAIN_PKT_LEN] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x88, 0x09, 0, 0 }; +static const u8 act_act_train_pkt[ICE_TRAIN_PKT_LEN] = { 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 }; #define ICE_RECIPE_LEN 64 #define ICE_LAG_SRIOV_CP_RECIPE 10 @@ -48,10 +51,10 @@ static void ice_lag_set_primary(struct ice_lag *lag) } /** - * ice_lag_set_backup - set PF LAG state to Backup + * ice_lag_set_bkup - set PF LAG state to Backup * @lag: LAG info struct */ -static void ice_lag_set_backup(struct ice_lag *lag) +static void ice_lag_set_bkup(struct ice_lag *lag) { struct ice_pf *pf = lag->pf; @@ -337,25 +340,15 @@ ice_lag_cfg_drop_fltr(struct ice_lag *lag, bool add) } /** - * ice_lag_cfg_pf_fltrs - set filters up for new active port + * ice_lag_cfg_pf_fltrs_act_bkup - set filters up for new active port * @lag: local interfaces lag struct - * @ptr: opaque data containing notifier event + * @bonding_info: netdev event bonding info */ static void -ice_lag_cfg_pf_fltrs(struct ice_lag *lag, void *ptr) +ice_lag_cfg_pf_fltrs_act_bkup(struct ice_lag *lag, + struct netdev_bonding_info *bonding_info) { - struct netdev_notifier_bonding_info *info = ptr; - struct netdev_bonding_info *bonding_info; - struct net_device *event_netdev; - struct device *dev; - - event_netdev = netdev_notifier_info_to_dev(ptr); - /* not for this netdev */ - if (event_netdev != lag->netdev) - return; - - bonding_info = &info->bonding_info; - dev = ice_pf_to_dev(lag->pf); + struct device *dev = ice_pf_to_dev(lag->pf); /* interface not active - remove old default VSI rule */ if (bonding_info->slave.state && lag->pf_rx_rule_id) { @@ -376,12 +369,13 @@ ice_lag_cfg_pf_fltrs(struct ice_lag *lag, void *ptr) } /** - * ice_lag_cfg_cp_fltr - configure filter for control packets + * ice_lag_cfg_lp_fltr - configure lport filters * @lag: local interface's lag struct * @add: add or remove rule + * @cp: 
control packet only or general PF lport rule */ static void -ice_lag_cfg_cp_fltr(struct ice_lag *lag, bool add) +ice_lag_cfg_lp_fltr(struct ice_lag *lag, bool add, bool cp) { struct ice_sw_rule_lkup_rx_tx *s_rule; struct ice_vsi *vsi = lag->pf->vsi[0]; @@ -395,8 +389,17 @@ ice_lag_cfg_cp_fltr(struct ice_lag *lag, bool add) } if (add) { - s_rule->hdr.type = cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_RX); - s_rule->recipe_id = cpu_to_le16(ICE_LAG_SRIOV_CP_RECIPE); + if (cp) { + s_rule->recipe_id = + cpu_to_le16(ICE_LAG_SRIOV_CP_RECIPE); + memcpy(s_rule->hdr_data, lacp_train_pkt, + ICE_TRAIN_PKT_LEN); + } else { + s_rule->recipe_id = cpu_to_le16(lag->act_act_recipe); + memcpy(s_rule->hdr_data, act_act_train_pkt, + ICE_TRAIN_PKT_LEN); + } + s_rule->src = cpu_to_le16(vsi->port_info->lport); s_rule->act = cpu_to_le32(ICE_FWD_TO_VSI | ICE_SINGLE_ACT_LAN_ENABLE | @@ -404,27 +407,66 @@ ice_lag_cfg_cp_fltr(struct ice_lag *lag, bool add) FIELD_PREP(ICE_SINGLE_ACT_VSI_ID_M, vsi->vsi_num)); s_rule->hdr_len = cpu_to_le16(ICE_TRAIN_PKT_LEN); - memcpy(s_rule->hdr_data, lacp_train_pkt, ICE_TRAIN_PKT_LEN); + s_rule->hdr.type = cpu_to_le16(ICE_AQC_SW_RULES_T_LKUP_RX); opc = ice_aqc_opc_add_sw_rules; } else { opc = ice_aqc_opc_remove_sw_rules; - s_rule->index = cpu_to_le16(lag->cp_rule_idx); + if (cp) + s_rule->index = cpu_to_le16(lag->cp_rule_idx); + else + s_rule->index = cpu_to_le16(lag->act_act_rule_idx); } if (ice_aq_sw_rules(&lag->pf->hw, s_rule, buf_len, 1, opc, NULL)) { - netdev_warn(lag->netdev, "Error %s CP rule for fail-over\n", - add ? "ADDING" : "REMOVING"); + netdev_warn(lag->netdev, "Error %s %s rule for aggregate\n", + add ? "ADDING" : "REMOVING", + cp ? "CONTROL PACKET" : "LPORT"); goto err_cp_free; } - if (add) - lag->cp_rule_idx = le16_to_cpu(s_rule->index); - else - lag->cp_rule_idx = 0; + if (add) { + if (cp) + lag->cp_rule_idx = le16_to_cpu(s_rule->index); + else + lag->act_act_rule_idx = le16_to_cpu(s_rule->index); + } else { + if (cp) + lag->cp_rule_idx = 0; + else + lag->act_act_rule_idx = 0; + } err_cp_free: kfree(s_rule); } +/** + * ice_lag_cfg_pf_fltrs - set filters up for PF traffic + * @lag: local interfaces lag struct + * @ptr: opaque data containing notifier event + */ +static void +ice_lag_cfg_pf_fltrs(struct ice_lag *lag, void *ptr) +{ + struct netdev_notifier_bonding_info *info = ptr; + struct netdev_bonding_info *bonding_info; + struct net_device *event_netdev; + + event_netdev = netdev_notifier_info_to_dev(ptr); + if (event_netdev != lag->netdev) + return; + + bonding_info = &info->bonding_info; + + if (lag->bond_aa) { + if (lag->need_fltr_cfg) { + ice_lag_cfg_lp_fltr(lag, true, false); + lag->need_fltr_cfg = false; + } + } else { + ice_lag_cfg_pf_fltrs_act_bkup(lag, bonding_info); + } +} + /** * ice_display_lag_info - print LAG info * @lag: LAG info struct @@ -648,7 +690,7 @@ ice_lag_move_vf_node_tc(struct ice_lag *lag, u8 oldport, u8 newport, } if (ice_aq_cfg_lan_txq(&lag->pf->hw, qbuf, qbuf_size, valq, oldport, - newport, NULL)) { + newport, ICE_AQC_Q_CFG_TC_CHNG, NULL)) { dev_warn(dev, "Failure to configure queues for LAG failover\n"); goto qbuf_err; } @@ -784,10 +826,17 @@ void ice_lag_move_new_vf_nodes(struct ice_vf *vf) if (lag->upper_netdev) ice_lag_build_netdev_list(lag, &ndlist); - if (ice_is_feature_supported(pf, ICE_F_SRIOV_LAG) && - lag->bonded && lag->primary && pri_port != act_port && - !list_empty(lag->netdev_head)) - ice_lag_move_single_vf_nodes(lag, pri_port, act_port, vsi->idx); + if (lag->bonded && lag->primary && !list_empty(lag->netdev_head)) { + if 
(lag->bond_aa && + ice_is_feature_supported(pf, ICE_F_SRIOV_AA_LAG)) + ice_lag_aa_failover(lag, ICE_LAGS_IDX, NULL); + + if (!lag->bond_aa && + ice_is_feature_supported(pf, ICE_F_SRIOV_LAG) && + pri_port != act_port) + ice_lag_move_single_vf_nodes(lag, pri_port, act_port, + vsi->idx); + } ice_lag_destroy_netdev_list(lag, &ndlist); @@ -851,11 +900,20 @@ u8 ice_lag_prepare_vf_reset(struct ice_lag *lag) u8 pri_prt, act_prt; if (lag && lag->bonded && lag->primary && lag->upper_netdev) { - pri_prt = lag->pf->hw.port_info->lport; - act_prt = lag->active_port; - if (act_prt != pri_prt && act_prt != ICE_LAG_INVALID_PORT) { - ice_lag_move_vf_nodes_cfg(lag, act_prt, pri_prt); - return act_prt; + if (!lag->bond_aa) { + pri_prt = lag->pf->hw.port_info->lport; + act_prt = lag->active_port; + if (act_prt != pri_prt && + act_prt != ICE_LAG_INVALID_PORT) { + ice_lag_move_vf_nodes_cfg(lag, act_prt, pri_prt); + return act_prt; + } + } else { + if (lag->port_bitmap & ICE_LAGS_M) { + lag->port_bitmap &= ~ICE_LAGS_M; + ice_lag_aa_failover(lag, ICE_LAGP_IDX, NULL); + lag->port_bitmap |= ICE_LAGS_M; + } } } @@ -873,10 +931,15 @@ void ice_lag_complete_vf_reset(struct ice_lag *lag, u8 act_prt) { u8 pri_prt; - if (lag && lag->bonded && lag->primary && - act_prt != ICE_LAG_INVALID_PORT) { - pri_prt = lag->pf->hw.port_info->lport; - ice_lag_move_vf_nodes_cfg(lag, pri_prt, act_prt); + if (lag && lag->bonded && lag->primary) { + if (!lag->bond_aa) { + pri_prt = lag->pf->hw.port_info->lport; + if (act_prt != ICE_LAG_INVALID_PORT) + ice_lag_move_vf_nodes_cfg(lag, pri_prt, + act_prt); + } else { + ice_lag_aa_failover(lag, ICE_LAGS_IDX, NULL); + } } } @@ -912,7 +975,7 @@ static void ice_lag_info_event(struct ice_lag *lag, void *ptr) } if (bonding_info->slave.state) - ice_lag_set_backup(lag); + ice_lag_set_bkup(lag); else ice_lag_set_primary(lag); @@ -920,6 +983,295 @@ lag_out: ice_display_lag_info(lag); } +/** + * ice_lag_aa_qbuf_recfg - fill a single queue buffer for recfg cmd + * @hw: HW struct that contains the queue context + * @qbuf: pointer to single queue buffer + * @vsi_num: index of the VF VSI in PF space + * @qnum: queue index + * + * Return: Zero on success, error code on failure. 
+ */ +static int +ice_lag_aa_qbuf_recfg(struct ice_hw *hw, struct ice_aqc_cfg_txqs_buf *qbuf, + u16 vsi_num, int qnum) +{ + struct ice_pf *pf = hw->back; + struct ice_q_ctx *q_ctx; + u16 q_id; + + q_ctx = ice_get_lan_q_ctx(hw, vsi_num, 0, qnum); + if (!q_ctx) { + dev_dbg(ice_hw_to_dev(hw), "LAG queue %d no Q context\n", qnum); + return -ENOENT; + } + + if (q_ctx->q_teid == ICE_INVAL_TEID) { + dev_dbg(ice_hw_to_dev(hw), "LAG queue %d INVAL TEID\n", qnum); + return -EINVAL; + } + + if (q_ctx->q_handle == ICE_INVAL_Q_HANDLE) { + dev_dbg(ice_hw_to_dev(hw), "LAG queue %d INVAL Q HANDLE\n", qnum); + return -EINVAL; + } + + q_id = pf->vsi[vsi_num]->txq_map[q_ctx->q_handle]; + qbuf->queue_info[0].q_handle = cpu_to_le16(q_id); + qbuf->queue_info[0].tc = 0; + qbuf->queue_info[0].q_teid = cpu_to_le32(q_ctx->q_teid); + + return 0; +} + +/** + * ice_lag_aa_move_vf_qs - Move some/all VF queues to destination + * @lag: primary interface's lag struct + * @dest: index of destination port + * @vsi_num: index of VF VSI in PF space + * @all: if true move all queues to destination + * @odd: VF wide q indicator for odd/even + * @e_pf: PF struct for the event interface + * + * the parameter "all" is to control whether we are splitting the queues + * between two interfaces or moving them all to the destination interface + */ +static void ice_lag_aa_move_vf_qs(struct ice_lag *lag, u8 dest, u16 vsi_num, + bool all, bool *odd, struct ice_pf *e_pf) +{ + DEFINE_RAW_FLEX(struct ice_aqc_cfg_txqs_buf, qbuf, queue_info, 1); + struct ice_hw *old_hw, *new_hw, *pri_hw, *sec_hw; + struct device *dev = ice_pf_to_dev(lag->pf); + struct ice_vsi_ctx *pv_ctx, *sv_ctx; + struct ice_lag_netdev_list ndlist; + u16 num_q, qbuf_size, sec_vsi_num; + u8 pri_lport, sec_lport; + u32 pvf_teid, svf_teid; + u16 vf_id; + + vf_id = lag->pf->vsi[vsi_num]->vf->vf_id; + /* If sec_vf[] not defined, then no second interface to share with */ + if (lag->sec_vf[vf_id]) + sec_vsi_num = lag->sec_vf[vf_id]->idx; + else + return; + + pri_lport = lag->bond_lport_pri; + sec_lport = lag->bond_lport_sec; + + if (pri_lport == ICE_LAG_INVALID_PORT || + sec_lport == ICE_LAG_INVALID_PORT) + return; + + if (!e_pf) + ice_lag_build_netdev_list(lag, &ndlist); + + pri_hw = &lag->pf->hw; + if (e_pf && lag->pf != e_pf) + sec_hw = &e_pf->hw; + else + sec_hw = ice_lag_find_hw_by_lport(lag, sec_lport); + + if (!pri_hw || !sec_hw) + return; + + if (dest == ICE_LAGP_IDX) { + struct ice_vsi *vsi; + + vsi = ice_get_main_vsi(lag->pf); + if (!vsi) + return; + + old_hw = sec_hw; + new_hw = pri_hw; + ice_lag_config_eswitch(lag, vsi->netdev); + } else { + struct ice_pf *sec_pf = sec_hw->back; + struct ice_vsi *vsi; + + vsi = ice_get_main_vsi(sec_pf); + if (!vsi) + return; + + old_hw = pri_hw; + new_hw = sec_hw; + ice_lag_config_eswitch(lag, vsi->netdev); + } + + pv_ctx = ice_get_vsi_ctx(pri_hw, vsi_num); + if (!pv_ctx) { + dev_warn(dev, "Unable to locate primary VSI %d context for LAG failover\n", + vsi_num); + return; + } + + sv_ctx = ice_get_vsi_ctx(sec_hw, sec_vsi_num); + if (!sv_ctx) { + dev_warn(dev, "Unable to locate secondary VSI %d context for LAG failover\n", + vsi_num); + return; + } + + num_q = pv_ctx->num_lan_q_entries[0]; + qbuf_size = __struct_size(qbuf); + + /* Suspend traffic for primary VSI VF */ + pvf_teid = le32_to_cpu(pv_ctx->sched.vsi_node[0]->info.node_teid); + ice_sched_suspend_resume_elems(pri_hw, 1, &pvf_teid, true); + + /* Suspend traffic for secondary VSI VF */ + svf_teid = le32_to_cpu(sv_ctx->sched.vsi_node[0]->info.node_teid); + 
ice_sched_suspend_resume_elems(sec_hw, 1, &svf_teid, true); + + for (int i = 0; i < num_q; i++) { + struct ice_sched_node *n_prt, *q_node, *parent; + struct ice_port_info *pi, *new_pi; + struct ice_vsi_ctx *src_ctx; + struct ice_sched_node *p; + struct ice_q_ctx *q_ctx; + u16 dst_vsi_num; + + pi = old_hw->port_info; + new_pi = new_hw->port_info; + + *odd = !(*odd); + if ((dest == ICE_LAGP_IDX && *odd && !all) || + (dest == ICE_LAGS_IDX && !(*odd) && !all) || + lag->q_home[vf_id][i] == dest) + continue; + + if (dest == ICE_LAGP_IDX) + dst_vsi_num = vsi_num; + else + dst_vsi_num = sec_vsi_num; + + n_prt = ice_sched_get_free_qparent(new_hw->port_info, + dst_vsi_num, 0, + ICE_SCHED_NODE_OWNER_LAN); + if (!n_prt) + continue; + + q_ctx = ice_get_lan_q_ctx(pri_hw, vsi_num, 0, i); + if (!q_ctx) + continue; + + if (dest == ICE_LAGP_IDX) + src_ctx = sv_ctx; + else + src_ctx = pv_ctx; + + q_node = ice_sched_find_node_by_teid(src_ctx->sched.vsi_node[0], + q_ctx->q_teid); + if (!q_node) + continue; + + qbuf->src_parent_teid = q_node->info.parent_teid; + qbuf->dst_parent_teid = n_prt->info.node_teid; + + /* Move the node in the HW/FW */ + if (ice_lag_aa_qbuf_recfg(pri_hw, qbuf, vsi_num, i)) + continue; + + if (dest == ICE_LAGP_IDX) + ice_aq_cfg_lan_txq(pri_hw, qbuf, qbuf_size, 1, + sec_lport, pri_lport, + ICE_AQC_Q_CFG_MOVE_TC_CHNG, + NULL); + else + ice_aq_cfg_lan_txq(pri_hw, qbuf, qbuf_size, 1, + pri_lport, sec_lport, + ICE_AQC_Q_CFG_MOVE_TC_CHNG, + NULL); + + /* Move the node in the SW */ + parent = q_node->parent; + if (!parent) + continue; + + for (int n = 0; n < parent->num_children; n++) { + int j; + + if (parent->children[n] != q_node) + continue; + + for (j = n + 1; j < parent->num_children; + j++) { + parent->children[j - 1] = + parent->children[j]; + } + parent->children[j] = NULL; + parent->num_children--; + break; + } + + p = pi->sib_head[0][q_node->tx_sched_layer]; + while (p) { + if (p->sibling == q_node) { + p->sibling = q_node->sibling; + break; + } + p = p->sibling; + } + + if (pi->sib_head[0][q_node->tx_sched_layer] == q_node) + pi->sib_head[0][q_node->tx_sched_layer] = + q_node->sibling; + + q_node->parent = n_prt; + q_node->info.parent_teid = n_prt->info.node_teid; + q_node->sibling = NULL; + p = new_pi->sib_head[0][q_node->tx_sched_layer]; + if (p) { + while (p) { + if (!p->sibling) { + p->sibling = q_node; + break; + } + p = p->sibling; + } + } else { + new_pi->sib_head[0][q_node->tx_sched_layer] = + q_node; + } + + n_prt->children[n_prt->num_children++] = q_node; + lag->q_home[vf_id][i] = dest; + } + + ice_sched_suspend_resume_elems(pri_hw, 1, &pvf_teid, false); + ice_sched_suspend_resume_elems(sec_hw, 1, &svf_teid, false); + + if (!e_pf) + ice_lag_destroy_netdev_list(lag, &ndlist); +} + +/** + * ice_lag_aa_failover - move VF queues in A/A mode + * @lag: primary lag struct + * @dest: index of destination port + * @e_pf: PF struct for event port + */ +void ice_lag_aa_failover(struct ice_lag *lag, u8 dest, struct ice_pf *e_pf) +{ + bool odd = true, all = false; + int i; + + /* Primary can be a target if down (cleanup), but secondary can't */ + if (dest == ICE_LAGS_IDX && !(lag->port_bitmap & ICE_LAGS_M)) + return; + + /* Move all queues to a destination if only one port is active, + * or no ports are active and dest is primary. 
+ */ + if ((lag->port_bitmap ^ (ICE_LAGP_M | ICE_LAGS_M)) || + (!lag->port_bitmap && dest == ICE_LAGP_IDX)) + all = true; + + ice_for_each_vsi(lag->pf, i) + if (lag->pf->vsi[i] && lag->pf->vsi[i]->type == ICE_VSI_VF) + ice_lag_aa_move_vf_qs(lag, dest, i, all, &odd, e_pf); +} + /** * ice_lag_reclaim_vf_tc - move scheduling nodes back to primary interface * @lag: primary interface lag struct @@ -982,7 +1334,7 @@ ice_lag_reclaim_vf_tc(struct ice_lag *lag, struct ice_hw *src_hw, u16 vsi_num, if (ice_aq_cfg_lan_txq(hw, qbuf, qbuf_size, numq, src_hw->port_info->lport, hw->port_info->lport, - NULL)) { + ICE_AQC_Q_CFG_TC_CHNG, NULL)) { dev_warn(dev, "Failure to configure queues for LAG failover\n"); goto reclaim_qerr; } @@ -1053,14 +1405,15 @@ static void ice_lag_link(struct ice_lag *lag) lag->bonded = true; lag->role = ICE_LAG_UNSET; + lag->need_fltr_cfg = true; netdev_info(lag->netdev, "Shared SR-IOV resources in bond are active\n"); } /** - * ice_lag_unlink - handle unlink event + * ice_lag_act_bkup_unlink - handle unlink event for A/B bond * @lag: LAG info struct */ -static void ice_lag_unlink(struct ice_lag *lag) +static void ice_lag_act_bkup_unlink(struct ice_lag *lag) { u8 pri_port, act_port, loc_port; struct ice_pf *pf = lag->pf; @@ -1096,10 +1449,32 @@ static void ice_lag_unlink(struct ice_lag *lag) } } } +} - lag->bonded = false; - lag->role = ICE_LAG_NONE; - lag->upper_netdev = NULL; +/** + * ice_lag_aa_unlink - handle unlink event for Active-Active bond + * @lag: LAG info struct + */ +static void ice_lag_aa_unlink(struct ice_lag *lag) +{ + struct ice_lag *pri_lag; + + if (lag->primary) { + pri_lag = lag; + lag->port_bitmap &= ~ICE_LAGP_M; + } else { + pri_lag = ice_lag_find_primary(lag); + if (pri_lag) + pri_lag->port_bitmap &= ICE_LAGS_M; + } + + if (pri_lag) { + ice_lag_aa_failover(pri_lag, ICE_LAGP_IDX, lag->pf); + if (lag->primary) + pri_lag->bond_lport_pri = ICE_LAG_INVALID_PORT; + else + pri_lag->bond_lport_sec = ICE_LAG_INVALID_PORT; + } } /** @@ -1115,10 +1490,20 @@ static void ice_lag_link_unlink(struct ice_lag *lag, void *ptr) if (netdev != lag->netdev) return; - if (info->linking) + if (info->linking) { ice_lag_link(lag); - else - ice_lag_unlink(lag); + } else { + if (lag->bond_aa) + ice_lag_aa_unlink(lag); + else + ice_lag_act_bkup_unlink(lag); + + lag->bonded = false; + lag->role = ICE_LAG_NONE; + lag->upper_netdev = NULL; + lag->bond_aa = false; + lag->need_fltr_cfg = false; + } } /** @@ -1320,6 +1705,11 @@ static void ice_lag_init_feature_support_flag(struct ice_pf *pf) ice_set_feature_support(pf, ICE_F_SRIOV_LAG); else ice_clear_feature_support(pf, ICE_F_SRIOV_LAG); + + if (caps->sriov_aa_lag && ice_pkg_has_lport_extract(&pf->hw)) + ice_set_feature_support(pf, ICE_F_SRIOV_AA_LAG); + else + ice_clear_feature_support(pf, ICE_F_SRIOV_AA_LAG); } /** @@ -1353,6 +1743,9 @@ static void ice_lag_changeupper_event(struct ice_lag *lag, void *ptr) /* Configure primary's SWID to be shared */ ice_lag_primary_swid(lag, true); primary_lag = lag; + lag->bond_lport_pri = lag->pf->hw.port_info->lport; + lag->bond_lport_sec = ICE_LAG_INVALID_PORT; + lag->port_bitmap = 0; } else { u16 swid; @@ -1362,16 +1755,29 @@ static void ice_lag_changeupper_event(struct ice_lag *lag, void *ptr) swid = primary_lag->pf->hw.port_info->sw_id; ice_lag_set_swid(swid, lag, true); ice_lag_add_prune_list(primary_lag, lag->pf); - ice_lag_cfg_drop_fltr(lag, true); + primary_lag->bond_lport_sec = + lag->pf->hw.port_info->lport; } /* add filter for primary control packets */ - ice_lag_cfg_cp_fltr(lag, true); + 
ice_lag_cfg_lp_fltr(lag, true, true); } else { if (!primary_lag && lag->primary) primary_lag = lag; + if (primary_lag) { + for (int i = 0; i < ICE_MAX_SRIOV_VFS; i++) { + if (primary_lag->sec_vf[i]) { + ice_vsi_release(primary_lag->sec_vf[i]); + primary_lag->sec_vf[i] = NULL; + } + } + } + if (!lag->primary) { ice_lag_set_swid(0, lag, false); + if (primary_lag) + primary_lag->bond_lport_sec = + ICE_LAG_INVALID_PORT; } else { if (primary_lag && lag->primary) { ice_lag_primary_swid(lag, false); @@ -1379,7 +1785,7 @@ static void ice_lag_changeupper_event(struct ice_lag *lag, void *ptr) } } /* remove filter for control packets */ - ice_lag_cfg_cp_fltr(lag, false); + ice_lag_cfg_lp_fltr(lag, false, !lag->bond_aa); } } @@ -1405,18 +1811,34 @@ static void ice_lag_monitor_link(struct ice_lag *lag, void *ptr) if (!netif_is_same_ice(lag->pf, event_netdev)) return; + if (info->upper_dev != lag->upper_netdev) + return; + + if (info->linking) + return; + pf = lag->pf; prim_hw = &pf->hw; prim_port = prim_hw->port_info->lport; - if (info->upper_dev != lag->upper_netdev) - return; - - if (!info->linking) { - /* Since there are only two interfaces allowed in SRIOV+LAG, if - * one port is leaving, then nodes need to be on primary - * interface. - */ + /* Since there are only two interfaces allowed in SRIOV+LAG, if + * one port is leaving, then nodes need to be on primary + * interface. + */ + if (lag->bond_aa) { + struct ice_netdev_priv *e_ndp; + struct ice_pf *e_pf; + + e_ndp = netdev_priv(event_netdev); + e_pf = e_ndp->vsi->back; + + if (lag->bond_lport_pri != ICE_LAG_INVALID_PORT && + lag->port_bitmap & ICE_LAGS_M) { + lag->port_bitmap &= ~ICE_LAGS_M; + ice_lag_aa_failover(lag, ICE_LAGP_IDX, e_pf); + lag->bond_lport_sec = ICE_LAG_INVALID_PORT; + } + } else { if (prim_port != lag->active_port && lag->active_port != ICE_LAG_INVALID_PORT) { active_hw = ice_lag_find_hw_by_lport(lag, @@ -1428,44 +1850,32 @@ static void ice_lag_monitor_link(struct ice_lag *lag, void *ptr) } /** - * ice_lag_monitor_active - main PF keep track of which port is active + * ice_lag_monitor_act_bkup - keep track of which port is active in A/B LAG * @lag: lag info struct - * @ptr: opaque data containing notifier event + * @b_info: bonding info + * @event_netdev: net_device got target netdev * * This function is for the primary PF to monitor changes in which port is * active and handle changes for SRIOV VF functionality */ -static void ice_lag_monitor_active(struct ice_lag *lag, void *ptr) +static void ice_lag_monitor_act_bkup(struct ice_lag *lag, + struct netdev_bonding_info *b_info, + struct net_device *event_netdev) { - struct netdev_notifier_bonding_info *info = ptr; - struct net_device *event_netdev, *event_upper; - struct netdev_bonding_info *bonding_info; struct ice_netdev_priv *event_np; struct ice_pf *pf, *event_pf; u8 prim_port, event_port; - if (!lag->primary) - return; - pf = lag->pf; if (!pf) return; - event_netdev = netdev_notifier_info_to_dev(ptr); - rcu_read_lock(); - event_upper = netdev_master_upper_dev_get_rcu(event_netdev); - rcu_read_unlock(); - if (!netif_is_ice(event_netdev) || event_upper != lag->upper_netdev) - return; - event_np = netdev_priv(event_netdev); event_pf = event_np->vsi->back; event_port = event_pf->hw.port_info->lport; prim_port = pf->hw.port_info->lport; - bonding_info = &info->bonding_info; - - if (!bonding_info->slave.state) { + if (!b_info->slave.state) { /* if no port is currently active, then nodes and filters exist * on primary port, check if we need to move them */ @@ -1501,6 +1911,128 @@ 
static void ice_lag_monitor_active(struct ice_lag *lag, void *ptr) } } +/** + * ice_lag_aa_clear_spoof - adjust the placeholder VSI spoofing for A/A LAG + * @vsi: placeholder VSI to adjust + */ +static void ice_lag_aa_clear_spoof(struct ice_vsi *vsi) +{ + ice_vsi_update_security(vsi, ice_vsi_ctx_clear_antispoof); +} + +/** + * ice_lag_monitor_act_act - Keep track of active ports in A/A LAG + * @lag: lag struct for primary interface + * @b_info: bonding_info for event + * @event_netdev: net_device for target netdev + */ +static void ice_lag_monitor_act_act(struct ice_lag *lag, + struct netdev_bonding_info *b_info, + struct net_device *event_netdev) +{ + struct ice_netdev_priv *event_np; + u8 prim_port, event_port; + struct ice_pf *event_pf; + + event_np = netdev_priv(event_netdev); + event_pf = event_np->vsi->back; + event_port = event_pf->hw.port_info->lport; + prim_port = lag->pf->hw.port_info->lport; + + if (b_info->slave.link == BOND_LINK_UP) { + /* Port is coming up */ + if (prim_port == event_port) { + /* Processing event for primary interface */ + if (lag->bond_lport_pri == ICE_LAG_INVALID_PORT) + return; + + if (!(lag->port_bitmap & ICE_LAGP_M)) { + /* Primary port was not marked up before, move + * some|all VF queues to it and mark as up + */ + lag->port_bitmap |= ICE_LAGP_M; + ice_lag_aa_failover(lag, ICE_LAGP_IDX, event_pf); + } + } else { + if (lag->bond_lport_sec == ICE_LAG_INVALID_PORT) + return; + + /* Create placeholder VSIs on secondary PF. + * The placeholder is necessary so that we have + * an element that represents the VF on the secondary + * interface's scheduling tree. This will be a tree + * root for scheduling nodes when they are moved to + * the secondary interface. + */ + if (!lag->sec_vf[0]) { + struct ice_vsi_cfg_params params = {}; + struct ice_vsi *nvsi; + struct ice_vf *vf; + unsigned int bkt; + + params.type = ICE_VSI_VF; + params.port_info = event_pf->hw.port_info; + params.flags = ICE_VSI_FLAG_INIT; + + ice_for_each_vf(lag->pf, bkt, vf) { + params.vf = vf; + nvsi = ice_vsi_setup(event_pf, + ¶ms); + ice_lag_aa_clear_spoof(nvsi); + lag->sec_vf[vf->vf_id] = nvsi; + } + } + + if (!(lag->port_bitmap & ICE_LAGS_M)) { + /* Secondary port was not marked up before, + * move some|all VF queues to it and mark as up + */ + lag->port_bitmap |= ICE_LAGS_M; + ice_lag_aa_failover(lag, ICE_LAGS_IDX, event_pf); + } + } + } else { + /* Port is going down */ + if (prim_port == event_port) { + lag->port_bitmap &= ~ICE_LAGP_M; + ice_lag_aa_failover(lag, ICE_LAGS_IDX, event_pf); + } else { + lag->port_bitmap &= ~ICE_LAGS_M; + ice_lag_aa_failover(lag, ICE_LAGP_IDX, event_pf); + } + } +} + +/** + * ice_lag_monitor_info - Calls relevant A/A or A/B monitoring function + * @lag: lag info struct + * @ptr: opaque data containing notifier event + * + * This function is for the primary PF to monitor changes in which port is + * active and handle changes for SRIOV VF functionality + */ +static void ice_lag_monitor_info(struct ice_lag *lag, void *ptr) +{ + struct netdev_notifier_bonding_info *info = ptr; + struct net_device *event_netdev, *event_upper; + struct netdev_bonding_info *bonding_info; + + if (!lag->primary) + return; + + event_netdev = netdev_notifier_info_to_dev(ptr); + bonding_info = &info->bonding_info; + rcu_read_lock(); + event_upper = netdev_master_upper_dev_get_rcu(event_netdev); + rcu_read_unlock(); + if (!netif_is_ice(event_netdev) || event_upper != lag->upper_netdev) + return; + + if (lag->bond_aa) + ice_lag_monitor_act_act(lag, bonding_info, event_netdev); + else + 
ice_lag_monitor_act_bkup(lag, bonding_info, event_netdev); +} /** * ice_lag_chk_comp - evaluate bonded interface for feature support * @lag: lag info struct @@ -1516,6 +2048,14 @@ ice_lag_chk_comp(struct ice_lag *lag, void *ptr) struct device *dev; int count = 0; + /* All members need to know if bond A/A or A/B */ + bonding_info = &info->bonding_info; + lag->bond_mode = bonding_info->master.bond_mode; + if (lag->bond_mode != BOND_MODE_ACTIVEBACKUP) + lag->bond_aa = true; + else + lag->bond_aa = false; + if (!lag->primary) return true; @@ -1536,12 +2076,9 @@ ice_lag_chk_comp(struct ice_lag *lag, void *ptr) return false; } - bonding_info = &info->bonding_info; - lag->bond_mode = bonding_info->master.bond_mode; - if (lag->bond_mode != BOND_MODE_ACTIVEBACKUP) { - dev_info(dev, "Bond Mode not ACTIVE-BACKUP - VF LAG disabled\n"); + if (lag->bond_aa && !ice_is_feature_supported(lag->pf, + ICE_F_SRIOV_AA_LAG)) return false; - } list_for_each(tmp, lag->netdev_head) { struct ice_dcbx_cfg *dcb_cfg, *peer_dcb_cfg; @@ -1699,6 +2236,26 @@ static void ice_lag_disable_sriov_bond(struct ice_lag *lag) struct ice_pf *pf = np->vsi->back; ice_clear_feature_support(pf, ICE_F_SRIOV_LAG); + ice_clear_feature_support(pf, ICE_F_SRIOV_AA_LAG); +} + +/** + * ice_lag_preset_drop_fltr - preset drop filter for A/B bonds + * @lag: local lag struct + * @ptr: opaque data containing event + * + * Sets the initial drop filter for secondary interface in an + * active-backup bond + */ +static void ice_lag_preset_drop_fltr(struct ice_lag *lag, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + + if (netdev != lag->netdev || lag->primary || !lag->need_fltr_cfg) + return; + + ice_lag_cfg_drop_fltr(lag, true); + lag->need_fltr_cfg = false; } /** @@ -1739,10 +2296,12 @@ static void ice_lag_process_event(struct work_struct *work) ice_lag_unregister(lag_work->lag, netdev); goto lag_cleanup; } - ice_lag_monitor_active(lag_work->lag, - &lag_work->info.bonding_info); ice_lag_cfg_pf_fltrs(lag_work->lag, &lag_work->info.bonding_info); + ice_lag_preset_drop_fltr(lag_work->lag, + &lag_work->info.bonding_info); + ice_lag_monitor_info(lag_work->lag, + &lag_work->info.bonding_info); } ice_lag_info_event(lag_work->lag, &lag_work->info.bonding_info); break; @@ -1993,7 +2552,8 @@ ice_lag_move_vf_nodes_tc_sync(struct ice_lag *lag, struct ice_hw *dest_hw, } if (ice_aq_cfg_lan_txq(hw, qbuf, qbuf_size, numq, hw->port_info->lport, - dest_hw->port_info->lport, NULL)) { + dest_hw->port_info->lport, + ICE_AQC_Q_CFG_TC_CHNG, NULL)) { dev_warn(dev, "Failure to configure queues for LAG reset rebuild\n"); goto sync_qerr; } @@ -2089,9 +2649,13 @@ int ice_init_lag(struct ice_pf *pf) lag->netdev = vsi->netdev; lag->role = ICE_LAG_NONE; lag->active_port = ICE_LAG_INVALID_PORT; + lag->port_bitmap = 0x0; lag->bonded = false; + lag->bond_aa = false; + lag->need_fltr_cfg = false; lag->upper_netdev = NULL; lag->notif_block.notifier_call = NULL; + memset(lag->sec_vf, 0, sizeof(lag->sec_vf)); err = ice_register_lag_handler(lag); if (err) { @@ -2109,6 +2673,11 @@ int ice_init_lag(struct ice_pf *pf) if (err) goto free_rcp_res; + err = ice_create_lag_recipe(&pf->hw, &lag->act_act_recipe, + ice_lport_rcp, 1); + if (err) + goto free_lport_res; + /* associate recipes to profiles */ for (n = 0; n < ICE_PROFID_IPV6_GTPU_IPV6_TCP_INNER; n++) { err = ice_aq_get_recipe_to_profile(&pf->hw, n, @@ -2118,7 +2687,8 @@ int ice_init_lag(struct ice_pf *pf) if (recipe_bits & BIT(ICE_SW_LKUP_DFLT)) { recipe_bits |= BIT(lag->pf_recipe) | - 
BIT(lag->lport_recipe); + BIT(lag->lport_recipe) | + BIT(lag->act_act_recipe); ice_aq_map_recipe_to_profile(&pf->hw, n, recipe_bits, NULL); } @@ -2129,9 +2699,13 @@ int ice_init_lag(struct ice_pf *pf) dev_dbg(dev, "INIT LAG complete\n"); return 0; +free_lport_res: + ice_free_hw_res(&pf->hw, ICE_AQC_RES_TYPE_RECIPE, 1, + &lag->lport_recipe); + free_rcp_res: ice_free_hw_res(&pf->hw, ICE_AQC_RES_TYPE_RECIPE, 1, - &pf->lag->pf_recipe); + &lag->pf_recipe); lag_error: kfree(lag); pf->lag = NULL; @@ -2216,11 +2790,15 @@ void ice_lag_rebuild(struct ice_pf *pf) ice_lag_move_vf_nodes_sync(prim_lag, &pf->hw); } - ice_lag_cfg_cp_fltr(lag, true); + if (!lag->bond_aa) { + ice_lag_cfg_lp_fltr(lag, true, true); + if (lag->pf_rx_rule_id) + if (ice_lag_cfg_dflt_fltr(lag, true)) + dev_err(ice_pf_to_dev(pf), "Error adding default VSI rule in rebuild\n"); + } else { + ice_lag_cfg_lp_fltr(lag, true, false); + } - if (lag->pf_rx_rule_id) - if (ice_lag_cfg_dflt_fltr(lag, true)) - dev_err(ice_pf_to_dev(pf), "Error adding default VSI rule in rebuild\n"); ice_clear_rdma_cap(pf); lag_rebuild_out: diff --git a/drivers/net/ethernet/intel/ice/ice_lag.h b/drivers/net/ethernet/intel/ice/ice_lag.h index 69347d9f986b..e2a0a782bdd7 100644 --- a/drivers/net/ethernet/intel/ice/ice_lag.h +++ b/drivers/net/ethernet/intel/ice/ice_lag.h @@ -14,7 +14,11 @@ enum ice_lag_role { ICE_LAG_UNSET }; -#define ICE_LAG_INVALID_PORT 0xFF +#define ICE_LAG_INVALID_PORT 0xFF +#define ICE_LAGP_IDX 0 +#define ICE_LAGS_IDX 1 +#define ICE_LAGP_M 0x1 +#define ICE_LAGS_M 0x2 #define ICE_LAG_RESET_RETRIES 5 #define ICE_SW_DEFAULT_PROFILE 0 @@ -41,12 +45,26 @@ struct ice_lag { u8 active_port; /* lport value for the current active port */ u8 bonded:1; /* currently bonded */ u8 primary:1; /* this is primary */ + u8 bond_aa:1; /* is this bond active-active */ + u8 need_fltr_cfg:1; /* fltrs for A/A bond still need to be make */ + u8 port_bitmap:2; /* bitmap of active ports */ + u8 bond_lport_pri; /* lport values for primary PF */ + u8 bond_lport_sec; /* lport values for secondary PF */ + + /* q_home keeps track of which interface the q is currently on */ + u8 q_home[ICE_MAX_SRIOV_VFS][ICE_MAX_RSS_QS_PER_VF]; + + /* placeholder VSI for hanging VF queues from on secondary interface */ + struct ice_vsi *sec_vf[ICE_MAX_SRIOV_VFS]; + u16 pf_recipe; u16 lport_recipe; + u16 act_act_recipe; u16 pf_rx_rule_id; u16 pf_tx_rule_id; u16 cp_rule_idx; u16 lport_rule_idx; + u16 act_act_rule_idx; u8 role; }; @@ -65,6 +83,7 @@ struct ice_lag_work { }; void ice_lag_move_new_vf_nodes(struct ice_vf *vf); +void ice_lag_aa_failover(struct ice_lag *lag, u8 dest, struct ice_pf *e_pf); int ice_init_lag(struct ice_pf *pf); void ice_deinit_lag(struct ice_pf *pf); void ice_lag_rebuild(struct ice_pf *pf); diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h index 2b07d5e48847..8d19efc1df72 100644 --- a/drivers/net/ethernet/intel/ice/ice_type.h +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -296,6 +296,7 @@ struct ice_hw_common_caps { bool roce_lag; bool sriov_lag; + bool sriov_aa_lag; bool nvm_update_pending_nvm; bool nvm_update_pending_orom; diff --git a/include/linux/net/intel/libie/adminq.h b/include/linux/net/intel/libie/adminq.h index dbe93f940ef0..ba62f703df43 100644 --- a/include/linux/net/intel/libie/adminq.h +++ b/include/linux/net/intel/libie/adminq.h @@ -194,6 +194,7 @@ LIBIE_CHECK_STRUCT_LEN(16, libie_aqc_list_caps); #define LIBIE_AQC_CAPS_FW_LAG_SUPPORT 0x0092 #define LIBIE_AQC_BIT_ROCEV2_LAG BIT(0) #define 
LIBIE_AQC_BIT_SRIOV_LAG BIT(1) +#define LIBIE_AQC_BIT_SRIOV_AA_LAG BIT(2) #define LIBIE_AQC_CAPS_FLEX10 0x00F1 #define LIBIE_AQC_CAPS_CEM 0x00F2 -- cgit v1.2.3 From 448f97fba9013ffa13f5dd82febd18836b189499 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Aug 2025 12:39:13 +0200 Subject: perf: Convert mmap() refcounts to refcount_t The recently fixed reference count leaks could have been detected by using refcount_t, and refcount_t would at least have mitigated the potential overflow. Now that the code is properly structured, convert the mmap() related mmap_count variants over to refcount_t. No functional change intended. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250812104020.071507932@infradead.org --- include/linux/perf_event.h | 2 +- kernel/events/core.c | 40 ++++++++++++++++++++-------------------- kernel/events/internal.h | 4 ++-- kernel/events/ring_buffer.c | 2 +- 4 files changed, 24 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ec9d96025683..bfbf9ea53f25 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -859,7 +859,7 @@ struct perf_event { /* mmap bits */ struct mutex mmap_mutex; - atomic_t mmap_count; + refcount_t mmap_count; struct perf_buffer *rb; struct list_head rb_entry; diff --git a/kernel/events/core.c b/kernel/events/core.c index f6211ab18503..ea357044d780 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3968,7 +3968,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx, */ static inline bool event_update_userpage(struct perf_event *event) { - if (likely(!atomic_read(&event->mmap_count))) + if (likely(!refcount_read(&event->mmap_count))) return false; perf_event_update_time(event); @@ -6704,11 +6704,11 @@ static void perf_mmap_open(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; mapped_f mapped = get_mapped(event, event_mapped); - atomic_inc(&event->mmap_count); - atomic_inc(&event->rb->mmap_count); + refcount_inc(&event->mmap_count); + refcount_inc(&event->rb->mmap_count); if (vma->vm_pgoff) - atomic_inc(&event->rb->aux_mmap_count); + refcount_inc(&event->rb->aux_mmap_count); if (mapped) mapped(event, vma->vm_mm); @@ -6743,7 +6743,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) * to avoid complications. */ if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && - atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { + refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) { /* * Stop all AUX events that are writing to this buffer, * so that we can free its AUX pages and corresponding PMU @@ -6763,10 +6763,10 @@ static void perf_mmap_close(struct vm_area_struct *vma) mutex_unlock(&rb->aux_mutex); } - if (atomic_dec_and_test(&rb->mmap_count)) + if (refcount_dec_and_test(&rb->mmap_count)) detach_rest = true; - if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) + if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) goto out_put; ring_buffer_attach(event, NULL); @@ -6992,19 +6992,19 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, if (data_page_nr(event->rb) != nr_pages) return -EINVAL; - if (atomic_inc_not_zero(&event->rb->mmap_count)) { + if (refcount_inc_not_zero(&event->rb->mmap_count)) { /* * Success -- managed to mmap() the same buffer * multiple times.
*/ perf_mmap_account(vma, user_extra, extra); - atomic_inc(&event->mmap_count); + refcount_inc(&event->mmap_count); return 0; } /* * Raced against perf_mmap_close()'s - * atomic_dec_and_mutex_lock() remove the + * refcount_dec_and_mutex_lock() remove the * event and continue as if !event->rb */ ring_buffer_attach(event, NULL); @@ -7023,7 +7023,7 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, if (!rb) return -ENOMEM; - atomic_set(&rb->mmap_count, 1); + refcount_set(&rb->mmap_count, 1); rb->mmap_user = get_current_user(); rb->mmap_locked = extra; @@ -7034,7 +7034,7 @@ static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event, perf_event_update_userpage(event); perf_mmap_account(vma, user_extra, extra); - atomic_set(&event->mmap_count, 1); + refcount_set(&event->mmap_count, 1); return 0; } @@ -7081,15 +7081,15 @@ static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, if (!is_power_of_2(nr_pages)) return -EINVAL; - if (!atomic_inc_not_zero(&rb->mmap_count)) + if (!refcount_inc_not_zero(&rb->mmap_count)) return -EINVAL; if (rb_has_aux(rb)) { - atomic_inc(&rb->aux_mmap_count); + refcount_inc(&rb->aux_mmap_count); } else { if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) { - atomic_dec(&rb->mmap_count); + refcount_dec(&rb->mmap_count); return -EPERM; } @@ -7101,16 +7101,16 @@ static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event, ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, rb_flags); if (ret) { - atomic_dec(&rb->mmap_count); + refcount_dec(&rb->mmap_count); return ret; } - atomic_set(&rb->aux_mmap_count, 1); + refcount_set(&rb->aux_mmap_count, 1); rb->aux_mmap_locked = extra; } perf_mmap_account(vma, user_extra, extra); - atomic_inc(&event->mmap_count); + refcount_inc(&event->mmap_count); return 0; } @@ -13254,7 +13254,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex); set: /* Can't redirect output if we've got an active mmap() */ - if (atomic_read(&event->mmap_count)) + if (refcount_read(&event->mmap_count)) goto unlock; if (output_event) { @@ -13267,7 +13267,7 @@ set: goto unlock; /* did we race against perf_mmap_close() */ - if (!atomic_read(&rb->mmap_count)) { + if (!refcount_read(&rb->mmap_count)) { ring_buffer_put(rb); goto unlock; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 249288d82b8d..d9cc57083091 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -35,7 +35,7 @@ struct perf_buffer { spinlock_t event_lock; struct list_head event_list; - atomic_t mmap_count; + refcount_t mmap_count; unsigned long mmap_locked; struct user_struct *mmap_user; @@ -47,7 +47,7 @@ struct perf_buffer { unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; - atomic_t aux_mmap_count; + refcount_t aux_mmap_count; unsigned long aux_mmap_locked; void (*free_aux)(void *); refcount_t aux_refcount; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index aa9a759e824f..20a905023736 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -400,7 +400,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * the same order, see perf_mmap_close. Otherwise we end up freeing * aux pages in this path, which is a bug, because in_atomic(). 
*/ - if (!atomic_read(&rb->aux_mmap_count)) goto err; if (!refcount_inc_not_zero(&rb->aux_refcount)) -- cgit v1.2.3 From 2335b3f56690f76ac34b972fcaef368bab1f76f2 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Mon, 16 Jun 2025 23:41:53 -0700 Subject: net/mlx5: mlx5_ifc, Add hardware definitions needed for adjacent vports Subsequent patches will implement the discovery and creation of adjacent function vports; this patch introduces the hardware structure definitions needed for the driver implementation. Signed-off-by: Saeed Mahameed Reviewed-by: Mark Bloch Reviewed-by: Parav Pandit Reviewed-by: Jack Morgenstein Signed-off-by: Alexei Lazar --- include/linux/mlx5/mlx5_ifc.h | 133 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 129 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 8360d9011d4f..44d497272162 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -189,6 +189,9 @@ enum { MLX5_CMD_OP_QUERY_XRQ_ERROR_PARAMS = 0x727, MLX5_CMD_OP_RELEASE_XRQ_ERROR = 0x729, MLX5_CMD_OP_MODIFY_XRQ = 0x72a, + MLX5_CMD_OPCODE_QUERY_DELEGATED_VHCA = 0x732, + MLX5_CMD_OPCODE_CREATE_ESW_VPORT = 0x733, + MLX5_CMD_OPCODE_DESTROY_ESW_VPORT = 0x734, MLX5_CMD_OP_QUERY_ESW_FUNCTIONS = 0x740, MLX5_CMD_OP_QUERY_VPORT_STATE = 0x750, MLX5_CMD_OP_MODIFY_VPORT_STATE = 0x751, @@ -2207,7 +2210,19 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_440[0x8]; u8 max_num_eqs_24b[0x18]; - u8 reserved_at_460[0x3a0]; + + u8 reserved_at_460[0x160]; + + u8 query_adjacent_functions_id[0x1]; + u8 ingress_egress_esw_vport_connect[0x1]; + u8 function_id_type_vhca_id[0x1]; + u8 reserved_at_5c3[0xd]; + u8 delegate_vhca_management_profiles[0x10]; + + u8 delegated_vhca_max[0x10]; + u8 delegate_vhca_max[0x10]; + + u8 reserved_at_600[0x200]; }; enum mlx5_ifc_flow_destination_type { @@ -5159,7 +5174,9 @@ struct mlx5_ifc_set_hca_cap_in_bits { u8 other_function[0x1]; u8 ec_vf_function[0x1]; - u8 reserved_at_42[0xe]; + u8 reserved_at_42[0x1]; + u8 function_id_type[0x1]; + u8 reserved_at_44[0xc]; u8 function_id[0x10]; u8 reserved_at_60[0x20]; @@ -6357,7 +6374,9 @@ struct mlx5_ifc_query_hca_cap_in_bits { u8 other_function[0x1]; u8 ec_vf_function[0x1]; - u8 reserved_at_42[0xe]; + u8 reserved_at_42[0x1]; + u8 function_id_type[0x1]; + u8 reserved_at_44[0xc]; u8 function_id[0x10]; u8 reserved_at_60[0x20]; @@ -6983,6 +7002,28 @@ struct mlx5_ifc_query_esw_vport_context_in_bits { u8 reserved_at_60[0x20]; }; +struct mlx5_ifc_destroy_esw_vport_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x20]; +}; + +struct mlx5_ifc_destroy_esw_vport_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 vport_num[0x10]; + + u8 reserved_at_60[0x20]; +}; + struct mlx5_ifc_modify_esw_vport_context_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -7484,6 +7525,85 @@ struct mlx5_ifc_query_adapter_in_bits { u8 reserved_at_40[0x40]; }; +struct mlx5_ifc_function_vhca_rid_info_reg_bits { + u8 host_number[0x8]; + u8 host_pci_device_function[0x8]; + u8 host_pci_bus[0x8]; + u8 reserved_at_18[0x3]; + u8 pci_bus_assigned[0x1]; + u8 function_type[0x4]; + + u8 parent_pci_device_function[0x8]; + u8 parent_pci_bus[0x8]; + u8 vhca_id[0x10]; + + u8 reserved_at_40[0x10]; + u8 function_id[0x10]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_delegated_function_vhca_rid_info_bits { 
struct mlx5_ifc_function_vhca_rid_info_reg_bits function_vhca_rid_info; + + u8 reserved_at_80[0x18]; + u8 manage_profile[0x8]; + + u8 reserved_at_a0[0x60]; +}; + +struct mlx5_ifc_query_delegated_vhca_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x10]; + u8 functions_count[0x10]; + + u8 reserved_at_80[0x80]; + + struct mlx5_ifc_delegated_function_vhca_rid_info_bits + delegated_function_vhca_rid_info[]; +}; + +struct mlx5_ifc_query_delegated_vhca_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_create_esw_vport_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x10]; + u8 vport_num[0x10]; +}; + +struct mlx5_ifc_create_esw_vport_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x10]; + u8 managed_vhca_id[0x10]; + + u8 reserved_at_60[0x20]; +}; + struct mlx5_ifc_qp_2rst_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -7611,7 +7731,12 @@ struct mlx5_ifc_modify_vport_state_in_bits { u8 reserved_at_41[0xf]; u8 vport_number[0x10]; - u8 reserved_at_60[0x18]; + u8 reserved_at_60[0x10]; + u8 ingress_connect[0x1]; + u8 egress_connect[0x1]; + u8 ingress_connect_valid[0x1]; + u8 egress_connect_valid[0x1]; + u8 reserved_at_74[0x4]; u8 admin_state[0x4]; u8 reserved_at_7c[0x4]; }; -- cgit v1.2.3 From 40653f280b2640e5caa94eeedee43e0f1df97704 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Mon, 16 Jun 2025 17:28:20 -0700 Subject: {rdma,net}/mlx5: export mlx5_vport_get_vhca_id The vhca id is already cached in the vport structure, so there is no need to query it on every mlx5 layer; use mlx5_vport_get_vhca_id() where possible. 
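For illustration only (not part of this patch), a caller that used to open-code the QUERY_HCA_CAP query can now collapse to a sketch like the fill_vport_vhca_id() conversion below; mdev and vport stand for the usual device and vport arguments:

	u16 vhca_id;
	int err;

	err = mlx5_vport_get_vhca_id(mdev, vport, &vhca_id);
	if (err)
		return err;
	/* vhca_id is served from the vport cache when available */
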
Signed-off-by: Saeed Mahameed Reviewed-by: Mark Bloch Reviewed-by: Parav Pandit Signed-off-by: Alexei Lazar Reviewed-by: Feng Liu Reviewed-by: Tariq Toukan --- drivers/infiniband/hw/mlx5/std_types.c | 27 ++++------------------ .../mellanox/mlx5/core/diag/reporter_vnic.c | 2 ++ .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 2 -- .../ethernet/mellanox/mlx5/core/steering/hws/cmd.c | 16 +++++++++---- .../mellanox/mlx5/core/steering/sws/dr_cmd.c | 16 +++++++++---- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 5 +++- include/linux/mlx5/vport.h | 2 ++ 7 files changed, 35 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c index bdb568411091..2fcf553044e1 100644 --- a/drivers/infiniband/hw/mlx5/std_types.c +++ b/drivers/infiniband/hw/mlx5/std_types.c @@ -83,33 +83,14 @@ static int fill_vport_icm_addr(struct mlx5_core_dev *mdev, u16 vport, static int fill_vport_vhca_id(struct mlx5_core_dev *mdev, u16 vport, struct mlx5_ib_uapi_query_port *info) { - size_t out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); - u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; - void *out; - int err; - - out = kzalloc(out_sz, GFP_KERNEL); - if (!out) - return -ENOMEM; + int err = mlx5_vport_get_vhca_id(mdev, vport, &info->vport_vhca_id); - MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); - MLX5_SET(query_hca_cap_in, in, other_function, true); - MLX5_SET(query_hca_cap_in, in, function_id, vport); - MLX5_SET(query_hca_cap_in, in, op_mod, - MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE | - HCA_CAP_OPMOD_GET_CUR); - - err = mlx5_cmd_exec(mdev, in, sizeof(in), out, out_sz); if (err) - goto out; - - info->vport_vhca_id = MLX5_GET(query_hca_cap_out, out, - capability.cmd_hca_cap.vhca_id); + return err; info->flags |= MLX5_IB_UAPI_QUERY_PORT_VPORT_VHCA_ID; -out: - kfree(out); - return err; + + return 0; } static int fill_multiport_info(struct mlx5_ib_dev *dev, u32 port_num, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c index 86253a89c24c..32bb769f1829 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/reporter_vnic.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. 
*/ +#include + #include "reporter_vnic.h" #include "en_stats.h" #include "devlink.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index b6d53db27cd5..81857c6f6bf7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -447,8 +447,6 @@ int mlx5_vport_set_other_func_cap(struct mlx5_core_dev *dev, const void *hca_cap #define mlx5_vport_get_other_func_general_cap(dev, vport, out) \ mlx5_vport_get_other_func_cap(dev, vport, out, MLX5_CAP_GENERAL) -int mlx5_vport_get_vhca_id(struct mlx5_core_dev *dev, u16 vport, u16 *vhca_id); - static inline u32 mlx5_sriov_get_vf_total_msix(struct pci_dev *pdev) { struct mlx5_core_dev *dev = pci_get_drvdata(pdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c index 9c83753e4592..d447574d86fe 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c @@ -1199,22 +1199,28 @@ out: int mlx5hws_cmd_query_gvmi(struct mlx5_core_dev *mdev, bool other_function, u16 vport_number, u16 *gvmi) { - bool ec_vf_func = other_function ? mlx5_core_is_ec_vf_vport(mdev, vport_number) : false; u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; int out_size; void *out; int err; + if (other_function) { + err = mlx5_vport_get_vhca_id(mdev, vport_number, gvmi); + if (!err) + return 0; + + mlx5_core_err(mdev, "Failed to get vport vhca id for vport %d\n", + vport_number); + return err; + } + + /* get vhca_id for `this` function */ out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out); out = kzalloc(out_size, GFP_KERNEL); if (!out) return -ENOMEM; MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); - MLX5_SET(query_hca_cap_in, in, other_function, other_function); - MLX5_SET(query_hca_cap_in, in, function_id, - mlx5_vport_to_func_id(mdev, vport_number, ec_vf_func)); - MLX5_SET(query_hca_cap_in, in, ec_vf_function, ec_vf_func); MLX5_SET(query_hca_cap_in, in, op_mod, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 | HCA_CAP_OPMOD_GET_CUR); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_cmd.c index baefb9a3fa05..bf99b933fd14 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_cmd.c @@ -2,6 +2,7 @@ /* Copyright (c) 2019 Mellanox Technologies. */ #include "dr_types.h" +#include "eswitch.h" int mlx5dr_cmd_query_esw_vport_context(struct mlx5_core_dev *mdev, bool other_vport, @@ -34,21 +35,28 @@ int mlx5dr_cmd_query_esw_vport_context(struct mlx5_core_dev *mdev, int mlx5dr_cmd_query_gvmi(struct mlx5_core_dev *mdev, bool other_vport, u16 vport_number, u16 *gvmi) { - bool ec_vf_func = other_vport ? 
mlx5_core_is_ec_vf_vport(mdev, vport_number) : false; u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {}; int out_size; void *out; int err; + if (other_vport) { + err = mlx5_vport_get_vhca_id(mdev, vport_number, gvmi); + if (!err) + return 0; + + mlx5_core_err(mdev, "Failed to get vport vhca id for vport %d\n", + vport_number); + return err; + } + + /* get vhca_id for `this` function */ out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out); out = kzalloc(out_size, GFP_KERNEL); if (!out) return -ENOMEM; MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); - MLX5_SET(query_hca_cap_in, in, other_function, other_vport); - MLX5_SET(query_hca_cap_in, in, function_id, mlx5_vport_to_func_id(mdev, vport_number, ec_vf_func)); - MLX5_SET(query_hca_cap_in, in, ec_vf_function, ec_vf_func); MLX5_SET(query_hca_cap_in, in, op_mod, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 | HCA_CAP_OPMOD_GET_CUR); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 231bedc6a252..2ed2e530b07d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -1239,7 +1239,9 @@ int mlx5_vport_get_vhca_id(struct mlx5_core_dev *dev, u16 vport, u16 *vhca_id) void *hca_caps; int err; - *vhca_id = 0; + /* try get vhca_id via eswitch */ + if (mlx5_esw_vport_vhca_id(dev->priv.eswitch, vport, vhca_id)) + return 0; query_ctx = kzalloc(query_out_sz, GFP_KERNEL); if (!query_ctx) @@ -1256,6 +1258,7 @@ out_free: kfree(query_ctx); return err; } +EXPORT_SYMBOL_GPL(mlx5_vport_get_vhca_id); int mlx5_vport_set_other_func_cap(struct mlx5_core_dev *dev, const void *hca_cap, u16 vport, u16 opmod) diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index c36cc6d82926..c87b9507cfa1 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -135,4 +135,6 @@ int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev); u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev); int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 vport, void *out, u16 opmod); +int mlx5_vport_get_vhca_id(struct mlx5_core_dev *dev, u16 vport, u16 *vhca_id); + #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From 33cfb80d1910b41d1a25cef89b159c945aff0f24 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 21 Jul 2025 14:13:10 +0000 Subject: crypto: ccp - Add support for SNP_FEATURE_INFO command The FEATURE_INFO command provides hypervisors with a programmatic means to learn about the supported features of the currently loaded firmware. This command mimics the CPUID instruction relative to sub-leaf input and the four unsigned integer output values. To obtain information regarding the features present in the currently loaded SEV firmware, use the SNP_FEATURE_INFO command. Cache the SNP platform status and feature information from CPUID 0x8000_0024 in the sev_device structure. If SNP is enabled, utilize this cached SNP platform status for the API major, minor and build version. 
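For illustration only (not part of this patch), issuing the command follows the usual sev_do_cmd() pattern; this sketch mirrors the snp_get_platform_data() hunk below, with allocation-failure and error handling elided. The output buffer is a freshly allocated page so that it is 8-byte aligned and cannot cross a page boundary:

	struct sev_data_snp_feature_info info = {};
	struct snp_feature_info *out = page_address(alloc_page(GFP_KERNEL));
	int error, rc;

	info.length = sizeof(info);
	info.ecx_in = 0;	/* sub-function 0 */
	info.feature_info_paddr = __psp_pa(out);
	rc = sev_do_cmd(SEV_CMD_SNP_FEATURE_INFO, &info, &error);
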
Reviewed-by: Tom Lendacky Signed-off-by: Ashish Kalra Reviewed-by: Kim Phillips Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sev-dev.c | 72 ++++++++++++++++++++++++++++++++++++++++++++ drivers/crypto/ccp/sev-dev.h | 3 ++ include/linux/psp-sev.h | 29 ++++++++++++++++++ 3 files changed, 104 insertions(+) (limited to 'include/linux') diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 528013be1c0a..a3941254d61f 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -233,6 +233,7 @@ static int sev_cmd_buffer_len(int cmd) case SEV_CMD_SNP_GUEST_REQUEST: return sizeof(struct sev_data_snp_guest_request); case SEV_CMD_SNP_CONFIG: return sizeof(struct sev_user_data_snp_config); case SEV_CMD_SNP_COMMIT: return sizeof(struct sev_data_snp_commit); + case SEV_CMD_SNP_FEATURE_INFO: return sizeof(struct sev_data_snp_feature_info); default: return 0; } @@ -1073,6 +1074,67 @@ static void snp_set_hsave_pa(void *arg) wrmsrq(MSR_VM_HSAVE_PA, 0); } +static int snp_get_platform_data(struct sev_device *sev, int *error) +{ + struct sev_data_snp_feature_info snp_feat_info; + struct snp_feature_info *feat_info; + struct sev_data_snp_addr buf; + struct page *page; + int rc; + + /* + * This function is expected to be called before SNP is + * initialized. + */ + if (sev->snp_initialized) + return -EINVAL; + + buf.address = __psp_pa(&sev->snp_plat_status); + rc = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, error); + if (rc) { + dev_err(sev->dev, "SNP PLATFORM_STATUS command failed, ret = %d, error = %#x\n", + rc, *error); + return rc; + } + + sev->api_major = sev->snp_plat_status.api_major; + sev->api_minor = sev->snp_plat_status.api_minor; + sev->build = sev->snp_plat_status.build_id; + + /* + * Do feature discovery of the currently loaded firmware, + * and cache feature information from CPUID 0x8000_0024, + * sub-function 0. + */ + if (!sev->snp_plat_status.feature_info) + return 0; + + /* + * Use dynamically allocated structure for the SNP_FEATURE_INFO + * command to ensure structure is 8-byte aligned, and does not + * cross a page boundary. + */ + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + feat_info = page_address(page); + snp_feat_info.length = sizeof(snp_feat_info); + snp_feat_info.ecx_in = 0; + snp_feat_info.feature_info_paddr = __psp_pa(feat_info); + + rc = sev_do_cmd(SEV_CMD_SNP_FEATURE_INFO, &snp_feat_info, error); + if (!rc) + sev->snp_feat_info_0 = *feat_info; + else + dev_err(sev->dev, "SNP FEATURE_INFO command failed, ret = %d, error = %#x\n", + rc, *error); + + __free_page(page); + + return rc; +} + static int snp_filter_reserved_mem_regions(struct resource *rs, void *arg) { struct sev_data_range_list *range_list = arg; @@ -1599,6 +1661,16 @@ static int sev_get_api_version(void) struct sev_user_data_status status; int error = 0, ret; + /* + * Cache SNP platform status and SNP feature information + * if SNP is available. 
+ */ + if (cc_platform_has(CC_ATTR_HOST_SEV_SNP)) { + ret = snp_get_platform_data(sev, &error); + if (ret) + return 1; + } + ret = sev_platform_status(&status, &error); if (ret) { dev_err(sev->dev, diff --git a/drivers/crypto/ccp/sev-dev.h b/drivers/crypto/ccp/sev-dev.h index 24dd8ff8afaa..5aed2595c9ae 100644 --- a/drivers/crypto/ccp/sev-dev.h +++ b/drivers/crypto/ccp/sev-dev.h @@ -58,6 +58,9 @@ struct sev_device { bool snp_initialized; struct sev_user_data_status sev_plat_status; + + struct sev_user_data_snp_status snp_plat_status; + struct snp_feature_info snp_feat_info_0; }; int sev_dev_init(struct psp_device *psp); diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 0f5f94137f6d..5fb6ae0f51cc 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -107,6 +107,7 @@ enum sev_cmd { SEV_CMD_SNP_DOWNLOAD_FIRMWARE_EX = 0x0CA, SEV_CMD_SNP_COMMIT = 0x0CB, SEV_CMD_SNP_VLEK_LOAD = 0x0CD, + SEV_CMD_SNP_FEATURE_INFO = 0x0CE, SEV_CMD_MAX, }; @@ -814,6 +815,34 @@ struct sev_data_snp_commit { u32 len; } __packed; +/** + * struct sev_data_snp_feature_info - SEV_SNP_FEATURE_INFO structure + * + * @length: len of the command buffer read by the PSP + * @ecx_in: subfunction index + * @feature_info_paddr : System Physical Address of the FEATURE_INFO structure + */ +struct sev_data_snp_feature_info { + u32 length; + u32 ecx_in; + u64 feature_info_paddr; +} __packed; + +/** + * struct feature_info - FEATURE_INFO structure + * + * @eax: output of SNP_FEATURE_INFO command + * @ebx: output of SNP_FEATURE_INFO command + * @ecx: output of SNP_FEATURE_INFO command + * #edx: output of SNP_FEATURE_INFO command + */ +struct snp_feature_info { + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; +} __packed; + #ifdef CONFIG_CRYPTO_DEV_SP_PSP /** -- cgit v1.2.3 From 45d59bd4a3e0f0475b3646e8b9936d34794e503d Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 21 Jul 2025 14:13:27 +0000 Subject: crypto: ccp - Introduce new API interface to indicate SEV-SNP Ciphertext hiding feature Implement an API that checks the overall feature support for SEV-SNP ciphertext hiding. This API verifies both the support of the SEV firmware for the feature and its enablement in the platform's BIOS. Reviewed-by: Tom Lendacky Signed-off-by: Ashish Kalra Reviewed-by: Kim Phillips Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sev-dev.c | 21 +++++++++++++++++++++ include/linux/psp-sev.h | 5 +++++ 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index a3941254d61f..58c9e040e9ac 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -1074,6 +1074,27 @@ static void snp_set_hsave_pa(void *arg) wrmsrq(MSR_VM_HSAVE_PA, 0); } +bool sev_is_snp_ciphertext_hiding_supported(void) +{ + struct psp_device *psp = psp_master; + struct sev_device *sev; + + if (!psp || !psp->sev_data) + return false; + + sev = psp->sev_data; + + /* + * Feature information indicates if CipherTextHiding feature is + * supported by the SEV firmware and additionally platform status + * indicates if CipherTextHiding feature is enabled in the + * Platform BIOS. 
+ */ + return ((sev->snp_feat_info_0.ecx & SNP_CIPHER_TEXT_HIDING_SUPPORTED) && + sev->snp_plat_status.ciphertext_hiding_cap); +} +EXPORT_SYMBOL_GPL(sev_is_snp_ciphertext_hiding_supported); + static int snp_get_platform_data(struct sev_device *sev, int *error) { struct sev_data_snp_feature_info snp_feat_info; diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 5fb6ae0f51cc..d83185b4268b 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -843,6 +843,8 @@ struct snp_feature_info { u32 edx; } __packed; +#define SNP_CIPHER_TEXT_HIDING_SUPPORTED BIT(3) + #ifdef CONFIG_CRYPTO_DEV_SP_PSP /** @@ -986,6 +988,7 @@ void *psp_copy_user_blob(u64 uaddr, u32 len); void *snp_alloc_firmware_page(gfp_t mask); void snp_free_firmware_page(void *addr); void sev_platform_shutdown(void); +bool sev_is_snp_ciphertext_hiding_supported(void); #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ @@ -1022,6 +1025,8 @@ static inline void snp_free_firmware_page(void *addr) { } static inline void sev_platform_shutdown(void) { } +static inline bool sev_is_snp_ciphertext_hiding_supported(void) { return false; } + #endif /* CONFIG_CRYPTO_DEV_SP_PSP */ #endif /* __PSP_SEV_H__ */ -- cgit v1.2.3 From c9760b0fca6bfa250c02e14bfe81c542f3626a72 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 21 Jul 2025 14:13:55 +0000 Subject: crypto: ccp - Add support to enable CipherTextHiding on SNP_INIT_EX To enable ciphertext hiding, it must be specified in the SNP_INIT_EX command as part of SNP initialization. Modify the sev_platform_init_args structure, which is used as input to sev_platform_init(), to include a field that, when non-zero, indicates that ciphertext hiding should be enabled and specifies the maximum ASID that can be used for an SEV-SNP guest. Reviewed-by: Tom Lendacky Signed-off-by: Ashish Kalra Reviewed-by: Kim Phillips Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sev-dev.c | 12 +++++++++--- include/linux/psp-sev.h | 10 ++++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 58c9e040e9ac..334405461657 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -1186,7 +1186,7 @@ static int snp_filter_reserved_mem_regions(struct resource *rs, void *arg) return 0; } -static int __sev_snp_init_locked(int *error) +static int __sev_snp_init_locked(int *error, unsigned int max_snp_asid) { struct psp_device *psp = psp_master; struct sev_data_snp_init_ex data; @@ -1247,6 +1247,12 @@ static int __sev_snp_init_locked(int *error) } memset(&data, 0, sizeof(data)); + + if (max_snp_asid) { + data.ciphertext_hiding_en = 1; + data.max_snp_asid = max_snp_asid; + } + data.init_rmp = 1; data.list_paddr_en = 1; data.list_paddr = __psp_pa(snp_range_list); @@ -1433,7 +1439,7 @@ static int _sev_platform_init_locked(struct sev_platform_init_args *args) if (sev->sev_plat_status.state == SEV_STATE_INIT) return 0; - rc = __sev_snp_init_locked(&args->error); + rc = __sev_snp_init_locked(&args->error, args->max_snp_asid); if (rc && rc != -ENODEV) return rc; @@ -1516,7 +1522,7 @@ static int snp_move_to_init_state(struct sev_issue_cmd *argp, bool *shutdown_req { int error, rc; - rc = __sev_snp_init_locked(&error); + rc = __sev_snp_init_locked(&error, 0); if (rc) { argp->error = SEV_RET_INVALID_PLATFORM_STATE; return rc; diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index d83185b4268b..e0dbcb4b4fd9 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -748,10 
+748,13 @@ struct sev_data_snp_guest_request { struct sev_data_snp_init_ex { u32 init_rmp:1; u32 list_paddr_en:1; - u32 rsvd:30; + u32 rapl_dis:1; + u32 ciphertext_hiding_en:1; + u32 rsvd:28; u32 rsvd1; u64 list_paddr; - u8 rsvd2[48]; + u16 max_snp_asid; + u8 rsvd2[46]; } __packed; /** @@ -800,10 +803,13 @@ struct sev_data_snp_shutdown_ex { * @probe: True if this is being called as part of CCP module probe, which * will defer SEV_INIT/SEV_INIT_EX firmware initialization until needed * unless psp_init_on_probe module param is set + * @max_snp_asid: When non-zero, enable ciphertext hiding and specify the + * maximum ASID that can be used for an SEV-SNP guest. */ struct sev_platform_init_args { int error; bool probe; + unsigned int max_snp_asid; }; /** -- cgit v1.2.3 From b76c739c3d11d1dacc8efe7fa873bee28ac991f1 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 22 Jul 2025 16:52:38 -0500 Subject: iio: fix iio_push_to_buffers_with_ts() typo Replace iio_push_to_buffer_with_ts() with iio_push_to_buffers_with_ts() in some documentation comments in iio.h. The latter is the correct name of the function, the former doesn't exist. Signed-off-by: David Lechner Link: https://patch.msgid.link/20250722-iio-fix-iio_push_to_buffer_with_ts-typo-v1-1-6ac9efb856d3@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index d11668f14a3e..2f5560646ee4 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -779,7 +779,7 @@ static inline void *iio_device_get_drvdata(const struct iio_dev *indio_dev) * them safe for use with non-coherent DMA. * * A number of drivers also use this on buffers that include a 64-bit timestamp - * that is used with iio_push_to_buffer_with_ts(). Therefore, in the case where + * that is used with iio_push_to_buffers_with_ts(). Therefore, in the case where * DMA alignment is not sufficient for proper timestamp alignment, we align to * 8 bytes instead. */ @@ -794,7 +794,7 @@ static inline void *iio_device_get_drvdata(const struct iio_dev *indio_dev) * @name: identifier name of the buffer * @count: number of elements in the buffer * - * Declares a buffer that is safe to use with iio_push_to_buffer_with_ts(). In + * Declares a buffer that is safe to use with iio_push_to_buffers_with_ts(). In * addition to allocating enough space for @count elements of @type, it also * allocates space for a s64 timestamp at the end of the buffer and ensures * proper alignment of the timestamp. -- cgit v1.2.3 From 4847d1187402a5027d9a04393f12d52a5a1d7f98 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Thu, 14 Aug 2025 09:24:41 +0200 Subject: console: introduce console_lock guard()s Having this, guards like these work: guard(console_lock)(); or scoped_guard(console_lock) { ... } See e.g. "vc_screen: use guard()s" later in this series. 
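A minimal before/after sketch, with a hypothetical do_something() standing in for the locked region:

	/* before */
	console_lock();
	do_something();
	console_unlock();

	/* after: console_unlock() runs automatically at end of scope */
	guard(console_lock)();
	do_something();
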
Signed-off-by: "Jiri Slaby (SUSE)" Link: https://lore.kernel.org/r/20250814072456.182853-2-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/console.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 8f10d0a85bb4..031a58dc2b91 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -666,6 +666,8 @@ void vcs_remove_sysfs(int index); */ extern atomic_t ignore_console_lock_warning; +DEFINE_LOCK_GUARD_0(console_lock, console_lock(), console_unlock()); + extern void console_init(void); /* For deferred console takeover */ -- cgit v1.2.3 From e8398b8aed50382c21fcec77e80a5314e7c45c25 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Thu, 14 Aug 2025 09:24:42 +0200 Subject: tty: introduce tty_port_tty guard() Having this, guards like these work: scoped_guard(tty_port_tty, port) tty_wakeup(scoped_tty()); See e.g. "tty_port: use scoped_guard()" later in this series. The definitions depend on CONFIG_TTY. It's due to tty_kref_put(). On !CONFIG_TTY, it is an inline and its declaration would conflict. The guards are not needed in that case, of course. Signed-off-by: "Jiri Slaby (SUSE)" Link: https://lore.kernel.org/r/20250814072456.182853-3-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_port.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h index 332ddb93603e..660c254f1efe 100644 --- a/include/linux/tty_port.h +++ b/include/linux/tty_port.h @@ -270,4 +270,18 @@ static inline void tty_port_tty_vhangup(struct tty_port *port) __tty_port_tty_hangup(port, false, false); } +#ifdef CONFIG_TTY +void tty_kref_put(struct tty_struct *tty); +__DEFINE_CLASS_IS_CONDITIONAL(tty_port_tty, true); +__DEFINE_UNLOCK_GUARD(tty_port_tty, struct tty_struct, tty_kref_put(_T->lock)); +static inline class_tty_port_tty_t class_tty_port_tty_constructor(struct tty_port *tport) +{ + class_tty_port_tty_t _t = { + .lock = tty_port_tty_get(tport), + }; + return _t; +} +#define scoped_tty() ((struct tty_struct *)(__guard_ptr(tty_port_tty)(&scope))) +#endif + #endif -- cgit v1.2.3 From 0fd60b689b0dacce659253ec15cb3d3bf660e30b Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Thu, 14 Aug 2025 09:24:43 +0200 Subject: serial: introduce uart_port_lock() guard()s Having this, guards like these work: guard(uart_port_lock_irq)(&up->port); or scoped_guard(uart_port_lock_irqsave, port) { ... } See e.g. "serial: 8250: use guard()s" later in this series. 
Signed-off-by: "Jiri Slaby (SUSE)" Link: https://lore.kernel.org/r/20250814072456.182853-4-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 84b4648ead7e..666430b47899 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -788,6 +788,19 @@ static inline void uart_port_unlock_irqrestore(struct uart_port *up, unsigned lo spin_unlock_irqrestore(&up->lock, flags); } +DEFINE_GUARD(uart_port_lock, struct uart_port *, uart_port_lock(_T), uart_port_unlock(_T)); +DEFINE_GUARD_COND(uart_port_lock, _try, uart_port_trylock(_T)); + +DEFINE_GUARD(uart_port_lock_irq, struct uart_port *, uart_port_lock_irq(_T), + uart_port_unlock_irq(_T)); + +DEFINE_LOCK_GUARD_1(uart_port_lock_irqsave, struct uart_port, + uart_port_lock_irqsave(_T->lock, &_T->flags), + uart_port_unlock_irqrestore(_T->lock, _T->flags), + unsigned long flags); +DEFINE_LOCK_GUARD_1_COND(uart_port_lock_irqsave, _try, + uart_port_trylock_irqsave(_T->lock, &_T->flags)); + static inline int serial_port_in(struct uart_port *up, int offset) { return up->serial_in(up, offset); -- cgit v1.2.3 From 292cb391479d50f4379a0abab34324de92c82a92 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 13 Aug 2025 23:03:52 -0700 Subject: software node: Constify node_group in registration functions The software_node_register_node_group() and software_node_unregister_node_group() functions take in essence an array of pointers to software_node structs. Since the functions do not modify the array declare the argument as constant, so that static arrays can be declared as const and annotated as __initconst. Signed-off-by: Dmitry Torokhov Link: https://lore.kernel.org/r/2zny5grbgtwbplynxffxg6dkgjgqf45aigwmgxio5stesdr3wi@gf2zamk5amic Signed-off-by: Greg Kroah-Hartman --- drivers/base/swnode.c | 5 ++--- include/linux/property.h | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index deda7f35a059..be1e9e61a7bf 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -844,7 +844,7 @@ swnode_register(const struct software_node *node, struct swnode *parent, * of this function or by ordering the array such that parent comes before * child. */ -int software_node_register_node_group(const struct software_node **node_group) +int software_node_register_node_group(const struct software_node * const *node_group) { unsigned int i; int ret; @@ -877,8 +877,7 @@ EXPORT_SYMBOL_GPL(software_node_register_node_group); * remove the nodes individually, in the correct order (child before * parent). 
*/ -void software_node_unregister_node_group( - const struct software_node **node_group) +void software_node_unregister_node_group(const struct software_node * const *node_group) { unsigned int i = 0; diff --git a/include/linux/property.h b/include/linux/property.h index 82f0cb3abd1e..d1e80b3c9918 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -574,8 +574,8 @@ const struct software_node * software_node_find_by_name(const struct software_node *parent, const char *name); -int software_node_register_node_group(const struct software_node **node_group); -void software_node_unregister_node_group(const struct software_node **node_group); +int software_node_register_node_group(const struct software_node * const *node_group); +void software_node_unregister_node_group(const struct software_node * const *node_group); int software_node_register(const struct software_node *node); void software_node_unregister(const struct software_node *node); -- cgit v1.2.3 From 894af4a1cde61c3401f237184fb770f72ff12df8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 12 Apr 2025 13:56:01 +0200 Subject: objtool: Validate kCFI calls Validate that all indirect calls adhere to kCFI rules. Notably, doing a nocfi indirect call to a cfi function is broken. Apparently some Rust 'core' code violates this and explodes when run with FineIBT. All the ANNOTATE_NOCFI_SYM sites are prime targets for attackers. - runtime EFI is especially heinous because it also needs to disable IBT. Basically, calling unknown code without CFI protection at runtime is a massive security issue. - Kexec image handover; if you can exploit this, you get to keep it :-) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Acked-by: Sean Christopherson Link: https://lkml.kernel.org/r/20250714103441.496787279@infradead.org --- arch/x86/kernel/machine_kexec_64.c | 4 ++++ arch/x86/kvm/vmx/vmenter.S | 4 ++++ arch/x86/platform/efi/efi_stub_64.S | 4 ++++ drivers/misc/lkdtm/perms.c | 5 +++++ include/linux/objtool.h | 10 +++++++++ include/linux/objtool_types.h | 1 + tools/include/linux/objtool_types.h | 1 + tools/objtool/check.c | 42 +++++++++++++++++++++++++++++++++++++ tools/objtool/include/objtool/elf.h | 1 + 9 files changed, 72 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 697fb99406e6..8593760c255a 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -453,6 +453,10 @@ void __nocfi machine_kexec(struct kimage *image) __ftrace_enabled_restore(save_ftrace_enabled); } +/* + * Handover to the next kernel, no CFI concern. + */ +ANNOTATE_NOCFI_SYM(machine_kexec); /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 0a6cf5bff2aa..bc255d709d8a 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -361,6 +361,10 @@ SYM_FUNC_END(vmread_error_trampoline) .section .text, "ax" +#ifndef CONFIG_X86_FRED + SYM_FUNC_START(vmx_do_interrupt_irqoff) VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1 SYM_FUNC_END(vmx_do_interrupt_irqoff) + +#endif diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 2206b8bc47b8..f0a5fba0717e 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -11,6 +11,10 @@ #include SYM_FUNC_START(__efi_call) + /* + * The EFI code doesn't have any CFI, annotate away the CFI violation. 
+ */ + ANNOTATE_NOCFI_SYM pushq %rbp movq %rsp, %rbp and $~0xf, %rsp diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c index 6c24426104ba..e1f5e9abb301 100644 --- a/drivers/misc/lkdtm/perms.c +++ b/drivers/misc/lkdtm/perms.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -86,6 +87,10 @@ static noinline __nocfi void execute_location(void *dst, bool write) func(); pr_err("FAIL: func returned\n"); } +/* + * Explicitly doing the wrong thing for testing. + */ +ANNOTATE_NOCFI_SYM(execute_location); static void execute_user_location(void *dst) { diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 366ad004d794..46ebaa46e6c5 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -184,6 +184,15 @@ * WARN using UD2. */ #define ANNOTATE_REACHABLE(label) __ASM_ANNOTATE(label, ANNOTYPE_REACHABLE) +/* + * This should not be used; it annotates away CFI violations. There are a few + * valid use cases like kexec handover to the next kernel image, and there is + * no security concern there. + * + * There are also a few real issues annotated away, like EFI because we can't + * control the EFI code. + */ +#define ANNOTATE_NOCFI_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOCFI)) #else #define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR @@ -194,6 +203,7 @@ #define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL #define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN #define ANNOTATE_REACHABLE ANNOTATE type=ANNOTYPE_REACHABLE +#define ANNOTATE_NOCFI_SYM ANNOTATE type=ANNOTYPE_NOCFI #endif #if defined(CONFIG_NOINSTR_VALIDATION) && \ diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index df5d9fa84dba..aceac94632c8 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -65,5 +65,6 @@ struct unwind_hint { #define ANNOTYPE_IGNORE_ALTS 6 #define ANNOTYPE_INTRA_FUNCTION_CALL 7 #define ANNOTYPE_REACHABLE 8 +#define ANNOTYPE_NOCFI 9 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/include/linux/objtool_types.h b/tools/include/linux/objtool_types.h index df5d9fa84dba..aceac94632c8 100644 --- a/tools/include/linux/objtool_types.h +++ b/tools/include/linux/objtool_types.h @@ -65,5 +65,6 @@ struct unwind_hint { #define ANNOTYPE_IGNORE_ALTS 6 #define ANNOTYPE_INTRA_FUNCTION_CALL 7 #define ANNOTYPE_REACHABLE 8 +#define ANNOTYPE_NOCFI 9 #endif /* _LINUX_OBJTOOL_TYPES_H */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index d14f20ef1db1..79eab61cd944 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2392,6 +2392,8 @@ static int __annotate_ifc(struct objtool_file *file, int type, struct instructio static int __annotate_late(struct objtool_file *file, int type, struct instruction *insn) { + struct symbol *sym; + switch (type) { case ANNOTYPE_NOENDBR: /* early */ @@ -2433,6 +2435,15 @@ static int __annotate_late(struct objtool_file *file, int type, struct instructi insn->dead_end = false; break; + case ANNOTYPE_NOCFI: + sym = insn->sym; + if (!sym) { + ERROR_INSN(insn, "dodgy NOCFI annotation"); + return -1; + } + insn->sym->nocfi = 1; + break; + default: ERROR_INSN(insn, "Unknown annotation type: %d", type); return -1; @@ -4002,6 +4013,37 @@ static int validate_retpoline(struct objtool_file *file) warnings++; } + if (!opts.cfi) + return warnings; + + /* + * kCFI call sites look like: + * + * movl $(-0x12345678), %r10d + * addl -4(%r11), %r10d + * jz 1f + * ud2 + * 1: cs call __x86_indirect_thunk_r11 + * + * Verify all indirect calls 
are kCFI adorned by checking for the + * UD2. Notably, doing __nocfi calls to regular (cfi) functions is + * broken. + */ + list_for_each_entry(insn, &file->retpoline_call_list, call_node) { + struct symbol *sym = insn->sym; + + if (sym && (sym->type == STT_NOTYPE || + sym->type == STT_FUNC) && !sym->nocfi) { + struct instruction *prev = + prev_insn_same_sym(file, insn); + + if (!prev || prev->type != INSN_BUG) { + WARN_INSN(insn, "no-cfi indirect call!"); + warnings++; + } + } + } + return warnings; } diff --git a/tools/objtool/include/objtool/elf.h b/tools/objtool/include/objtool/elf.h index 0a2fa3ac0079..df8434d3b744 100644 --- a/tools/objtool/include/objtool/elf.h +++ b/tools/objtool/include/objtool/elf.h @@ -70,6 +70,7 @@ struct symbol { u8 local_label : 1; u8 frame_pointer : 1; u8 ignore : 1; + u8 nocfi : 1; struct list_head pv_target; struct reloc *relocs; struct section *group_sec; -- cgit v1.2.3 From 89d912e494f786e79f69ed9d567a8842c71dbb03 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 14 Aug 2025 11:59:27 +0200 Subject: bpf: Add dynptr type for skb metadata Add a dynptr type, similar to skb dynptr, but for skb metadata access. The dynptr provides an alternative to __sk_buff->data_meta for accessing the custom metadata area allocated using the bpf_xdp_adjust_meta() helper. More importantly, it abstracts away where the storage for the custom metadata lives, which opens up the way to persist the metadata by relocating it as the skb travels through the network stack layers. Writes to skb metadata invalidate any existing skb payload and metadata slices. While this is more restrictive than needed at the moment, it leaves the door open to reallocating the metadata on writes, and should be only a minor inconvenience to the users. Only the program types which can access __sk_buff->data_meta today are allowed to create a dynptr for skb metadata at the moment. We need to modify the network stack to persist the metadata across layers before opening up access to other BPF hooks. Once more BPF hooks gain access to skb_meta dynptr, we will also need to add a read-only variant of the helper similar to bpf_dynptr_from_skb_rdonly. skb_meta dynptr ops are stubbed out and implemented by subsequent changes. Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Reviewed-by: Jesse Brandeburg Link: https://patch.msgid.link/20250814-skb-metadata-thru-dynptr-v7-1-8a39e636e0fb@cloudflare.com --- include/linux/bpf.h | 7 ++++++- kernel/bpf/helpers.c | 7 +++++++ kernel/bpf/log.c | 2 ++ kernel/bpf/verifier.c | 15 +++++++++++++-- net/core/filter.c | 41 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 69 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cc700925b802..ec527b476dba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -767,12 +767,15 @@ enum bpf_type_flag { */ MEM_WRITE = BIT(18 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to skb_metadata_end()-skb_metadata_len() */ + DYNPTR_TYPE_SKB_META = BIT(19 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; #define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \ | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META) /* Max number of base types. 
*/ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -1358,6 +1361,8 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_SKB, /* Underlying data is a xdp_buff */ BPF_DYNPTR_TYPE_XDP, + /* Points to skb_metadata_end()-skb_metadata_len() */ + BPF_DYNPTR_TYPE_SKB_META, }; int bpf_dynptr_check_size(u32 size); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6b4877e85a68..9552b32208c5 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1780,6 +1780,8 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_XDP: return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); + case BPF_DYNPTR_TYPE_SKB_META: + return -EOPNOTSUPP; /* not implemented */ default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; @@ -1836,6 +1838,8 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, if (flags) return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); + case BPF_DYNPTR_TYPE_SKB_META: + return -EOPNOTSUPP; /* not implemented */ default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; @@ -1882,6 +1886,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3 return (unsigned long)(ptr->data + ptr->offset + offset); case BPF_DYNPTR_TYPE_SKB: case BPF_DYNPTR_TYPE_XDP: + case BPF_DYNPTR_TYPE_SKB_META: /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */ return 0; default: @@ -2710,6 +2715,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false); return buffer__opt; } + case BPF_DYNPTR_TYPE_SKB_META: + return NULL; /* not implemented */ default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 38050f4ee400..e4983c1303e7 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -498,6 +498,8 @@ const char *dynptr_type_str(enum bpf_dynptr_type type) return "skb"; case BPF_DYNPTR_TYPE_XDP: return "xdp"; + case BPF_DYNPTR_TYPE_SKB_META: + return "skb_meta"; case BPF_DYNPTR_TYPE_INVALID: return ""; default: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4f69a9e9af6..5964bed40ffb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -674,6 +674,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type) return BPF_DYNPTR_TYPE_SKB; case DYNPTR_TYPE_XDP: return BPF_DYNPTR_TYPE_XDP; + case DYNPTR_TYPE_SKB_META: + return BPF_DYNPTR_TYPE_SKB_META; default: return BPF_DYNPTR_TYPE_INVALID; } @@ -690,6 +692,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type) return DYNPTR_TYPE_SKB; case BPF_DYNPTR_TYPE_XDP: return DYNPTR_TYPE_XDP; + case BPF_DYNPTR_TYPE_SKB_META: + return DYNPTR_TYPE_SKB_META; default: return 0; } @@ -2274,7 +2278,8 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg) { return base_type(reg->type) == PTR_TO_MEM && - (reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP); + (reg->type & + (DYNPTR_TYPE_SKB | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META)); } /* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. 
*/ @@ -11641,7 +11646,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (dynptr_type == BPF_DYNPTR_TYPE_INVALID) return -EFAULT; - if (dynptr_type == BPF_DYNPTR_TYPE_SKB) + if (dynptr_type == BPF_DYNPTR_TYPE_SKB || + dynptr_type == BPF_DYNPTR_TYPE_SKB_META) /* this will trigger clear_all_pkt_pointers(), which will * invalidate all dynptr slices associated with the skb */ @@ -12228,6 +12234,7 @@ enum special_kfunc_type { KF_bpf_rbtree_right, KF_bpf_dynptr_from_skb, KF_bpf_dynptr_from_xdp, + KF_bpf_dynptr_from_skb_meta, KF_bpf_dynptr_slice, KF_bpf_dynptr_slice_rdwr, KF_bpf_dynptr_clone, @@ -12277,9 +12284,11 @@ BTF_ID(func, bpf_rbtree_right) #ifdef CONFIG_NET BTF_ID(func, bpf_dynptr_from_skb) BTF_ID(func, bpf_dynptr_from_xdp) +BTF_ID(func, bpf_dynptr_from_skb_meta) #else BTF_ID_UNUSED BTF_ID_UNUSED +BTF_ID_UNUSED #endif BTF_ID(func, bpf_dynptr_slice) BTF_ID(func, bpf_dynptr_slice_rdwr) @@ -13253,6 +13262,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ dynptr_arg_type |= DYNPTR_TYPE_SKB; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp]) { dynptr_arg_type |= DYNPTR_TYPE_XDP; + } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb_meta]) { + dynptr_arg_type |= DYNPTR_TYPE_SKB_META; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_clone] && (dynptr_arg_type & MEM_UNINIT)) { enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; diff --git a/net/core/filter.c b/net/core/filter.c index da391e2b0788..31b4b50dbadf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -12007,6 +12007,36 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, return 0; } +/** + * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area. + * @skb_: socket buffer carrying the metadata + * @flags: future use, must be zero + * @ptr__uninit: dynptr to initialize + * + * Set up a dynptr for access to the metadata area earlier allocated from the + * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to + * &__sk_buff->data_meta. 
+ * + * Return: + * * %0 - dynptr ready to use + * * %-EINVAL - invalid flags, dynptr set to null + */ +__bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags, + struct bpf_dynptr *ptr__uninit) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; + struct sk_buff *skb = (struct sk_buff *)skb_; + + if (flags) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); + + return 0; +} + __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags, struct bpf_dynptr *ptr__uninit) { @@ -12181,6 +12211,10 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_skb) BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) +BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) + BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) @@ -12202,6 +12236,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .set = &bpf_kfunc_check_set_skb, }; +static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_skb_meta, +}; + static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_xdp, @@ -12237,6 +12276,8 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); -- cgit v1.2.3 From 6877cd392baecf816c2ba896a9d42874628004a5 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 14 Aug 2025 11:59:28 +0200 Subject: bpf: Enable read/write access to skb metadata through a dynptr Now that we can create a dynptr to skb metadata, make reads from the metadata area possible with bpf_dynptr_read() or through a bpf_dynptr_slice(), and make writes to the metadata area possible with bpf_dynptr_write() or through a bpf_dynptr_slice_rdwr(). Note that for cloned skbs which share data with the original, we limit the skb metadata dynptr to be read-only since we don't unclone on a bpf_dynptr_write to metadata. 
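For illustration only (not part of this patch), a tc (BPF_PROG_TYPE_SCHED_CLS) program could use the dynptr as in the sketch below; it assumes an XDP program earlier reserved at least 4 bytes of metadata with bpf_xdp_adjust_meta(), and skb is the program's __sk_buff context:

	struct bpf_dynptr meta;
	__u32 val;

	if (bpf_dynptr_from_skb_meta(skb, 0, &meta))
		return TC_ACT_OK;
	if (bpf_dynptr_read(&val, sizeof(val), &meta, 0, 0))
		return TC_ACT_OK;
	val++;
	/* fails with -EINVAL if the dynptr is read-only (cloned skb) */
	bpf_dynptr_write(&meta, 0, &val, sizeof(val), 0);
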
Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250814-skb-metadata-thru-dynptr-v7-2-8a39e636e0fb@cloudflare.com --- include/linux/filter.h | 6 ++++++ kernel/bpf/helpers.c | 10 +++++++--- net/core/filter.c | 16 ++++++++++++++++ 3 files changed, 29 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1e7fd3ee759e..9ed21b65e2e9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1784,6 +1784,7 @@ int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len); void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush); +void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) @@ -1818,6 +1819,11 @@ static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, voi unsigned long len, bool flush) { } + +static inline void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) +{ + return NULL; +} #endif /* CONFIG_NET */ #endif /* __LINUX_FILTER_H__ */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9552b32208c5..cdffd74ddbe6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1781,7 +1781,8 @@ static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *s case BPF_DYNPTR_TYPE_XDP: return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len); case BPF_DYNPTR_TYPE_SKB_META: - return -EOPNOTSUPP; /* not implemented */ + memmove(dst, bpf_skb_meta_pointer(src->data, src->offset + offset), len); + return 0; default: WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type); return -EFAULT; @@ -1839,7 +1840,10 @@ int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, return -EINVAL; return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len); case BPF_DYNPTR_TYPE_SKB_META: - return -EOPNOTSUPP; /* not implemented */ + if (flags) + return -EINVAL; + memmove(bpf_skb_meta_pointer(dst->data, dst->offset + offset), src, len); + return 0; default: WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type); return -EFAULT; @@ -2716,7 +2720,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset, return buffer__opt; } case BPF_DYNPTR_TYPE_SKB_META: - return NULL; /* not implemented */ + return bpf_skb_meta_pointer(ptr->data, ptr->offset + offset); default: WARN_ONCE(true, "unknown dynptr type %d\n", type); return NULL; diff --git a/net/core/filter.c b/net/core/filter.c index 31b4b50dbadf..63f3baee2daf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -11990,6 +11990,16 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return func; } +/** + * bpf_skb_meta_pointer() - Gets a mutable pointer within the skb metadata area. + * @skb: socket buffer carrying the metadata + * @offset: offset into the metadata area, must be <= skb_metadata_len() + */ +void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) +{ + return skb_metadata_end(skb) - skb_metadata_len(skb) + offset; +} + __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, struct bpf_dynptr *ptr__uninit) @@ -12017,6 +12027,9 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, * XDP context with bpf_xdp_adjust_meta(). 
Serves as an alternative to * &__sk_buff->data_meta. * + * If passed @skb_ is a clone which shares the data with the original, the + * dynptr will be read-only. This limitation may be lifted in the future. + * * Return: * * %0 - dynptr ready to use * * %-EINVAL - invalid flags, dynptr set to null @@ -12034,6 +12047,9 @@ __bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags, bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); + if (skb_cloned(skb)) + bpf_dynptr_set_rdonly(ptr); + return 0; } -- cgit v1.2.3 From 807221d3c5ff6e3c91ff57bc82a0b7a541462e20 Mon Sep 17 00:00:00 2001 From: Ricky Wu Date: Tue, 12 Aug 2025 14:35:21 +0800 Subject: misc: rtsx_pci: Add separate CD/WP pin polarity reversal support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, the Card Detect (CD) and Write Protect (WP) pins shared the same reverse polarity setting in the configuration space. This meant both signals were reversed together, without the ability to configure them individually. This patch introduces two new parameters: sd_cd_reverse_en – enable reverse polarity for the CD pin. sd_wp_reverse_en – enable reverse polarity for the WP pin. With this change, the controller can now support: 1.Reversing both CD and WP pins together (original behavior). 2.Reversing CD and WP pins separately (newly added behavior), if supported by the configuration space. This provides greater flexibility when dealing with devices that have independent polarity requirements for CD and WP pins. Signed-off-by: Ricky Wu Link: https://lore.kernel.org/r/20250812063521.2427696-1-ricky_wu@realtek.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cardreader/rts5227.c | 13 ++++++++++--- drivers/misc/cardreader/rts5228.c | 12 ++++++++++-- drivers/misc/cardreader/rts5249.c | 16 +++++++++++++--- drivers/misc/cardreader/rts5264.c | 20 ++++++++++++++++---- drivers/misc/cardreader/rts5264.h | 1 + drivers/misc/cardreader/rtsx_pcr.h | 2 ++ include/linux/rtsx_pci.h | 2 ++ 7 files changed, 54 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/cardreader/rts5227.c b/drivers/misc/cardreader/rts5227.c index cd512284bfb3..46444bb47f65 100644 --- a/drivers/misc/cardreader/rts5227.c +++ b/drivers/misc/cardreader/rts5227.c @@ -79,6 +79,10 @@ static void rts5227_fetch_vendor_settings(struct rtsx_pcr *pcr) pcr->sd30_drive_sel_3v3 = rtsx_reg_to_sd30_drive_sel_3v3(reg); if (rtsx_reg_check_reverse_socket(reg)) pcr->flags |= PCR_REVERSE_SOCKET; + if (rtsx_reg_check_cd_reverse(reg)) + pcr->option.sd_cd_reverse_en = 1; + if (rtsx_reg_check_wp_reverse(reg)) + pcr->option.sd_wp_reverse_en = 1; } static void rts5227_init_from_cfg(struct rtsx_pcr *pcr) @@ -127,8 +131,10 @@ static int rts5227_extra_init_hw(struct rtsx_pcr *pcr) /* Configure force_clock_req */ if (pcr->flags & PCR_REVERSE_SOCKET) rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0x30, 0x30); - else - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0x30, 0x00); + else { + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0x20, option->sd_cd_reverse_en << 5); + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0x10, option->sd_wp_reverse_en << 4); + } if (CHK_PCI_PID(pcr, 0x522A)) rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, RTS522A_AUTOLOAD_CFG1, @@ -350,6 +356,8 @@ void rts5227_init_params(struct rtsx_pcr *pcr) pcr->ms_pull_ctl_disable_tbl = rts5227_ms_pull_ctl_disable_tbl; pcr->reg_pm_ctrl3 = PM_CTRL3; + pcr->option.sd_cd_reverse_en = 0; + pcr->option.sd_wp_reverse_en = 0; } static int 
rts522a_optimize_phy(struct rtsx_pcr *pcr) @@ -508,5 +516,4 @@ void rts522a_init_params(struct rtsx_pcr *pcr) pcr->hw_param.interrupt_en |= SD_OC_INT_EN; pcr->hw_param.ocp_glitch = SD_OCP_GLITCH_10M; pcr->option.sd_800mA_ocp_thd = RTS522A_OCP_THD_800; - } diff --git a/drivers/misc/cardreader/rts5228.c b/drivers/misc/cardreader/rts5228.c index 0c7f10bcf6f1..db7e735ac24f 100644 --- a/drivers/misc/cardreader/rts5228.c +++ b/drivers/misc/cardreader/rts5228.c @@ -84,6 +84,10 @@ static void rtsx5228_fetch_vendor_settings(struct rtsx_pcr *pcr) pcr->sd30_drive_sel_3v3 = rtsx_reg_to_sd30_drive_sel_3v3(reg); if (rtsx_reg_check_reverse_socket(reg)) pcr->flags |= PCR_REVERSE_SOCKET; + if (rtsx_reg_check_cd_reverse(reg)) + pcr->option.sd_cd_reverse_en = 1; + if (rtsx_reg_check_wp_reverse(reg)) + pcr->option.sd_wp_reverse_en = 1; } static int rts5228_optimize_phy(struct rtsx_pcr *pcr) @@ -432,8 +436,10 @@ static int rts5228_extra_init_hw(struct rtsx_pcr *pcr) if (pcr->flags & PCR_REVERSE_SOCKET) rtsx_pci_write_register(pcr, PETXCFG, 0x30, 0x30); - else - rtsx_pci_write_register(pcr, PETXCFG, 0x30, 0x00); + else { + rtsx_pci_write_register(pcr, PETXCFG, 0x20, option->sd_cd_reverse_en << 5); + rtsx_pci_write_register(pcr, PETXCFG, 0x10, option->sd_wp_reverse_en << 4); + } /* * If u_force_clkreq_0 is enabled, CLKREQ# PIN will be forced @@ -720,4 +726,6 @@ void rts5228_init_params(struct rtsx_pcr *pcr) hw_param->interrupt_en |= SD_OC_INT_EN; hw_param->ocp_glitch = SD_OCP_GLITCH_800U; option->sd_800mA_ocp_thd = RTS5228_LDO1_OCP_THD_930; + option->sd_cd_reverse_en = 0; + option->sd_wp_reverse_en = 0; } diff --git a/drivers/misc/cardreader/rts5249.c b/drivers/misc/cardreader/rts5249.c index 6c81040e18be..38aefd8db452 100644 --- a/drivers/misc/cardreader/rts5249.c +++ b/drivers/misc/cardreader/rts5249.c @@ -60,6 +60,7 @@ static void rtsx_base_fetch_vendor_settings(struct rtsx_pcr *pcr) pci_read_config_dword(pdev, PCR_SETTING_REG1, ®); pcr_dbg(pcr, "Cfg 0x%x: 0x%x\n", PCR_SETTING_REG1, reg); + pci_write_config_dword(pdev, 0x718, 0x0007C000); if (!rtsx_vendor_setting_valid(reg)) { pcr_dbg(pcr, "skip fetch vendor setting\n"); @@ -82,6 +83,10 @@ static void rtsx_base_fetch_vendor_settings(struct rtsx_pcr *pcr) pcr->sd30_drive_sel_3v3 = rtsx_reg_to_sd30_drive_sel_3v3(reg); if (rtsx_reg_check_reverse_socket(reg)) pcr->flags |= PCR_REVERSE_SOCKET; + if (rtsx_reg_check_cd_reverse(reg)) + pcr->option.sd_cd_reverse_en = 1; + if (rtsx_reg_check_wp_reverse(reg)) + pcr->option.sd_wp_reverse_en = 1; } static void rts5249_init_from_cfg(struct rtsx_pcr *pcr) @@ -254,9 +259,11 @@ static int rts5249_extra_init_hw(struct rtsx_pcr *pcr) /* Configure driving */ rts5249_fill_driving(pcr, OUTPUT_3V3); if (pcr->flags & PCR_REVERSE_SOCKET) - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0xB0, 0xB0); - else - rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0xB0, 0x80); + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0x30, 0x30); + else { + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0x20, option->sd_cd_reverse_en << 5); + rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, PETXCFG, 0x10, option->sd_wp_reverse_en << 4); + } rtsx_pci_send_cmd(pcr, CMD_TIMEOUT_DEF); @@ -572,6 +579,9 @@ void rts5249_init_params(struct rtsx_pcr *pcr) option->ltr_l1off_sspwrgate = LTR_L1OFF_SSPWRGATE_5249_DEF; option->ltr_l1off_snooze_sspwrgate = LTR_L1OFF_SNOOZE_SSPWRGATE_5249_DEF; + + option->sd_cd_reverse_en = 0; + option->sd_wp_reverse_en = 0; } static int rts524a_write_phy(struct rtsx_pcr *pcr, u8 addr, u16 val) diff --git a/drivers/misc/cardreader/rts5264.c 
b/drivers/misc/cardreader/rts5264.c index d050c9fff7ac..99a2d5ea6421 100644 --- a/drivers/misc/cardreader/rts5264.c +++ b/drivers/misc/cardreader/rts5264.c @@ -527,8 +527,16 @@ static void rts5264_init_from_hw(struct rtsx_pcr *pcr) pcr->rtd3_en = rts5264_reg_to_rtd3(lval2); - if (rts5264_reg_check_reverse_socket(lval2)) - pcr->flags |= PCR_REVERSE_SOCKET; + if (rts5264_reg_check_reverse_socket(lval2)) { + if (is_version_higher_than(pcr, PID_5264, RTS5264_IC_VER_B)) + pcr->option.sd_cd_reverse_en = 1; + else + pcr->flags |= PCR_REVERSE_SOCKET; + } + + if (rts5264_reg_check_wp_reverse(lval2) && + is_version_higher_than(pcr, PID_5264, RTS5264_IC_VER_B)) + pcr->option.sd_wp_reverse_en = 1; pci_read_config_dword(pdev, setting_reg1, &lval1); pcr_dbg(pcr, "Cfg 0x%x: 0x%x\n", setting_reg1, lval1); @@ -622,8 +630,10 @@ static int rts5264_extra_init_hw(struct rtsx_pcr *pcr) if (pcr->flags & PCR_REVERSE_SOCKET) rtsx_pci_write_register(pcr, PETXCFG, 0x30, 0x30); - else - rtsx_pci_write_register(pcr, PETXCFG, 0x30, 0x00); + else { + rtsx_pci_write_register(pcr, PETXCFG, 0x20, option->sd_cd_reverse_en << 5); + rtsx_pci_write_register(pcr, PETXCFG, 0x10, option->sd_wp_reverse_en << 4); + } /* * If u_force_clkreq_0 is enabled, CLKREQ# PIN will be forced @@ -957,4 +967,6 @@ void rts5264_init_params(struct rtsx_pcr *pcr) hw_param->interrupt_en |= (SD_OC_INT_EN | SD_OVP_INT_EN); hw_param->ocp_glitch = SD_OCP_GLITCH_800U | SDVIO_OCP_GLITCH_800U; option->sd_800mA_ocp_thd = RTS5264_LDO1_OCP_THD_1150; + option->sd_cd_reverse_en = 0; + option->sd_wp_reverse_en = 0; } diff --git a/drivers/misc/cardreader/rts5264.h b/drivers/misc/cardreader/rts5264.h index f3e81daa708d..611ee253367c 100644 --- a/drivers/misc/cardreader/rts5264.h +++ b/drivers/misc/cardreader/rts5264.h @@ -14,6 +14,7 @@ #define rts5264_reg_to_aspm(reg) \ (((~(reg) >> 28) & 0x02) | (((reg) >> 28) & 0x01)) #define rts5264_reg_check_reverse_socket(reg) ((reg) & 0x04) +#define rts5264_reg_check_wp_reverse(reg) ((reg) & 0x8000) #define rts5264_reg_to_sd30_drive_sel_1v8(reg) (((reg) >> 22) & 0x03) #define rts5264_reg_to_sd30_drive_sel_3v3(reg) (((reg) >> 16) & 0x03) #define rts5264_reg_to_rtd3(reg) ((reg) & 0x08) diff --git a/drivers/misc/cardreader/rtsx_pcr.h b/drivers/misc/cardreader/rtsx_pcr.h index 8e5951b61143..40562ff2be13 100644 --- a/drivers/misc/cardreader/rtsx_pcr.h +++ b/drivers/misc/cardreader/rtsx_pcr.h @@ -100,6 +100,8 @@ static inline u8 map_sd_drive(int idx) #define rtsx_reg_to_sd30_drive_sel_3v3(reg) (((reg) >> 5) & 0x03) #define rtsx_reg_to_card_drive_sel(reg) ((((reg) >> 25) & 0x01) << 6) #define rtsx_reg_check_reverse_socket(reg) ((reg) & 0x4000) +#define rtsx_reg_check_cd_reverse(reg) ((reg) & 0x800000) +#define rtsx_reg_check_wp_reverse(reg) ((reg) & 0x400000) #define rts5209_reg_to_aspm(reg) (((reg) >> 5) & 0x03) #define rts5209_reg_check_ms_pmos(reg) (!((reg) & 0x08)) #define rts5209_reg_to_sd30_drive_sel_1v8(reg) (((reg) >> 3) & 0x07) diff --git a/include/linux/rtsx_pci.h b/include/linux/rtsx_pci.h index 3b4c36705a9b..3c5689356004 100644 --- a/include/linux/rtsx_pci.h +++ b/include/linux/rtsx_pci.h @@ -1160,6 +1160,8 @@ struct rtsx_cr_option { bool ocp_en; u8 sd_400mA_ocp_thd; u8 sd_800mA_ocp_thd; + u8 sd_cd_reverse_en; + u8 sd_wp_reverse_en; }; /* -- cgit v1.2.3 From f5597840ac907858ad2a462b00e4a68fd199121e Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Mon, 14 Jul 2025 23:34:14 +0800 Subject: char: misc: Disallow registering miscdevice whose minor > MISC_DYNAMIC_MINOR Currently, It is allowed to register miscdevice with minor > 
255 (the value of the macro MISC_DYNAMIC_MINOR), which causes:

- Chaos regarding the division and management of minor codes.
- Registration failure if the minor was already allocated to another
  dynamic request.

Fortunately, in-kernel users have not had such usage yet.

Fix by refusing to register a miscdevice whose minor > 255. Also bring
in a very simple minor code space division and management scheme:

 < 255  : Fixed minor code
== 255  : Indicator to request a dynamic minor code
 > 255  : Dynamic minor code requested, 1048320 minor codes in total

All fixed minors allocated should be registered in 'linux/miscdevice.h'.

Signed-off-by: Zijun Hu
Link: https://lore.kernel.org/r/20250714-rfc_miscdev-v6-3-2ed949665bde@oss.qualcomm.com
Signed-off-by: Greg Kroah-Hartman
---
 drivers/char/misc.c        | 6 ++++++
 include/linux/miscdevice.h | 8 ++++++++
 2 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index 558302a64dd9..b8e66466184f 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -210,6 +210,12 @@ int misc_register(struct miscdevice *misc)
 	int err = 0;
 	bool is_dynamic = (misc->minor == MISC_DYNAMIC_MINOR);
 
+	if (misc->minor > MISC_DYNAMIC_MINOR) {
+		pr_err("Invalid fixed minor %d for miscdevice '%s'\n",
+		       misc->minor, misc->name);
+		return -EINVAL;
+	}
+
 	INIT_LIST_HEAD(&misc->list);
 
 	mutex_lock(&misc_mtx);
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 3e6deb00fc85..565b88efeb23 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -71,6 +71,14 @@
 #define USERIO_MINOR		240
 #define VHOST_VSOCK_MINOR	241
 #define RFKILL_MINOR		242
+
+/*
+ * Misc char device minor code space division related to below macro:
+ *
+ *  < 255 : Fixed minor code
+ * == 255 : Indicator to request dynamic minor code
+ *  > 255 : Dynamic minor code requested, 1048320 minor codes totally.
+ */
 #define MISC_DYNAMIC_MINOR	255
 
 struct miscdevice {
-- cgit v1.2.3

From d7f8d0758b975db8406c91cf242d46cd9611ba3e Mon Sep 17 00:00:00 2001
From: Zijun Hu
Date: Mon, 14 Jul 2025 23:34:18 +0800
Subject: char: misc: Register fixed minor EISA_EEPROM_MINOR in linux/miscdevice.h

Move the fixed minor EISA_EEPROM_MINOR definition to linux/miscdevice.h.
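For illustration, a driver using such a centrally registered fixed minor would look roughly like this (a hedged sketch; the device name and fops are made up):

	#include <linux/fs.h>
	#include <linux/miscdevice.h>
	#include <linux/module.h>

	static const struct file_operations demo_fops = {
		.owner = THIS_MODULE,
	};

	/* Fixed minor taken from the central registry in linux/miscdevice.h;
	 * anything above MISC_DYNAMIC_MINOR is now rejected with -EINVAL.
	 */
	static struct miscdevice demo_misc = {
		.minor = EISA_EEPROM_MINOR,
		.name  = "demo_eeprom",
		.fops  = &demo_fops,
	};

	static int __init demo_init(void)
	{
		return misc_register(&demo_misc);
	}
	module_init(demo_init);

	static void __exit demo_exit(void)
	{
		misc_deregister(&demo_misc);
	}
	module_exit(demo_exit);

	MODULE_LICENSE("GPL");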
Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20250714-rfc_miscdev-v6-7-2ed949665bde@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- drivers/parisc/eisa_eeprom.c | 2 -- include/linux/miscdevice.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/parisc/eisa_eeprom.c b/drivers/parisc/eisa_eeprom.c index 443b15422fc1..601cbb22574f 100644 --- a/drivers/parisc/eisa_eeprom.c +++ b/drivers/parisc/eisa_eeprom.c @@ -15,8 +15,6 @@ #include #include -#define EISA_EEPROM_MINOR 241 - static loff_t eisa_eeprom_llseek(struct file *file, loff_t offset, int origin) { return fixed_size_llseek(file, offset, origin, HPEE_MAX_LENGTH); diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index 565b88efeb23..7d0aa718499c 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -70,6 +70,7 @@ #define UHID_MINOR 239 #define USERIO_MINOR 240 #define VHOST_VSOCK_MINOR 241 +#define EISA_EEPROM_MINOR 241 #define RFKILL_MINOR 242 /* -- cgit v1.2.3 From 1d6249c1ce826fcf03c695973095eb4a50fb7fd2 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 11 Aug 2025 11:13:35 +0200 Subject: sysfs: remove bin_attribute::read_new/write_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These transitional fields are now unused and unnecessary. Remove them and their logic in the sysfs core. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250811-sysfs-const-bin_attr-final-v4-1-7b6053fd58bb@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 22 +++++----------------- include/linux/sysfs.h | 4 ---- 2 files changed, 5 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1ca143d2f22a..3825e780cc58 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -97,12 +97,9 @@ static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, count = size - pos; } - if (!battr->read && !battr->read_new) + if (!battr->read) return -EIO; - if (battr->read_new) - return battr->read_new(of->file, kobj, battr, buf, pos, count); - return battr->read(of->file, kobj, battr, buf, pos, count); } @@ -161,12 +158,9 @@ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, if (!count) return 0; - if (!battr->write && !battr->write_new) + if (!battr->write) return -EIO; - if (battr->write_new) - return battr->write_new(of->file, kobj, battr, buf, pos, count); - return battr->write(of->file, kobj, battr, buf, pos, count); } @@ -335,19 +329,13 @@ int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent, const struct kernfs_ops *ops; struct kernfs_node *kn; - if (battr->read && battr->read_new) - return -EINVAL; - - if (battr->write && battr->write_new) - return -EINVAL; - if (battr->mmap) ops = &sysfs_bin_kfops_mmap; - else if ((battr->read || battr->read_new) && (battr->write || battr->write_new)) + else if (battr->read && battr->write) ops = &sysfs_bin_kfops_rw; - else if (battr->read || battr->read_new) + else if (battr->read) ops = &sysfs_bin_kfops_ro; - else if (battr->write || battr->write_new) + else if (battr->write) ops = &sysfs_bin_kfops_wo; else ops = &sysfs_file_kfops_empty; diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index f418aae4f113..7544f6d81c05 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -308,12 +308,8 @@ struct bin_attribute { struct address_space *(*f_mapping)(void); ssize_t (*read)(struct file *, struct kobject *, const struct 
bin_attribute *, char *, loff_t, size_t);
-	ssize_t (*read_new)(struct file *, struct kobject *, const struct bin_attribute *,
-			    char *, loff_t, size_t);
 	ssize_t (*write)(struct file *, struct kobject *, const struct bin_attribute *,
 			 char *, loff_t, size_t);
-	ssize_t (*write_new)(struct file *, struct kobject *,
-			     const struct bin_attribute *, char *, loff_t, size_t);
 	loff_t (*llseek)(struct file *, struct kobject *, const struct bin_attribute *,
 			 loff_t, int);
 	int (*mmap)(struct file *, struct kobject *, const struct bin_attribute *attr,
-- cgit v1.2.3

From 44d454fcffa8b08d6d66df132121c1d387fa85db Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh
Date: Mon, 11 Aug 2025 11:13:36 +0200
Subject: sysfs: remove attribute_group::bin_attrs_new
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This transitional field is now unused and unnecessary. Remove it.

Signed-off-by: Thomas Weißschuh
Link: https://lore.kernel.org/r/20250811-sysfs-const-bin_attr-final-v4-2-7b6053fd58bb@weissschuh.net
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/sysfs.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 7544f6d81c05..9a25a2911652 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -106,10 +106,7 @@ struct attribute_group {
 					  const struct bin_attribute *, int);
 	struct attribute	**attrs;
-	union {
-		const struct bin_attribute	*const *bin_attrs;
-		const struct bin_attribute	*const *bin_attrs_new;
-	};
+	const struct bin_attribute	*const *bin_attrs;
 };
 
 #define SYSFS_PREALLOC 010000
@@ -293,7 +290,7 @@ __ATTRIBUTE_GROUPS(_name)
 
 #define BIN_ATTRIBUTE_GROUPS(_name)					\
 static const struct attribute_group _name##_group = {			\
-	.bin_attrs_new = _name##_attrs,					\
+	.bin_attrs = _name##_attrs,					\
 };									\
 __ATTRIBUTE_GROUPS(_name)
 
-- cgit v1.2.3

From 74f44ad07d1063933c237a7db16f6a4036643d60 Mon Sep 17 00:00:00 2001
From: Biju Das
Date: Wed, 30 Jul 2025 17:46:14 +0100
Subject: mmc: tmio: Add 64-bit read/write support for SD_BUF0 in polling mode

As per the RZ/{G2L,G3E} HW manual, SD_BUF0 can be accessed with 16-, 32-, or 64-bit widths. Most data transfers in SD/SDIO/eMMC mode are larger than 8 bytes. During testing it was found that, if the DMA buffer is not aligned to 128 bits, the driver falls back to PIO mode. In such cases, 64-bit access is much more efficient than the current 16-bit access.
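For illustration, an SoC glue driver would opt in by setting the new flag in its platform data (a minimal sketch; the variable name is hypothetical). PIO transfers are then split into count >> 3 full 64-bit words, with any remaining count & 0x7 bytes staged through one extra 64-bit access, as the diff below shows:

	#include <linux/platform_data/tmio.h>

	/* Advertise the 64-bit SD_BUF0 data port so that PIO transfers use
	 * readsq()/writesq() instead of 16-bit accesses.
	 */
	static struct tmio_mmc_data demo_tmio_pdata = {
		.flags = TMIO_MMC_64BIT_DATA_PORT,
	};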
Tested-by: Wolfram Sang Reviewed-by: Wolfram Sang Signed-off-by: Biju Das Link: https://lore.kernel.org/r/20250730164618.233117-2-biju.das.jz@bp.renesas.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/tmio_mmc.h | 15 +++++++++++++++ drivers/mmc/host/tmio_mmc_core.c | 33 +++++++++++++++++++++++++++++++++ include/linux/platform_data/tmio.h | 3 +++ 3 files changed, 51 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index d730b7633ae1..c8cdb1c0722e 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -242,6 +243,20 @@ static inline void sd_ctrl_read32_rep(struct tmio_mmc_host *host, int addr, ioread32_rep(host->ctl + (addr << host->bus_shift), buf, count); } +#ifdef CONFIG_64BIT +static inline void sd_ctrl_read64_rep(struct tmio_mmc_host *host, int addr, + u64 *buf, int count) +{ + readsq(host->ctl + (addr << host->bus_shift), buf, count); +} + +static inline void sd_ctrl_write64_rep(struct tmio_mmc_host *host, int addr, + const u64 *buf, int count) +{ + writesq(host->ctl + (addr << host->bus_shift), buf, count); +} +#endif + static inline void sd_ctrl_write16(struct tmio_mmc_host *host, int addr, u16 val) { diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 21c2f9095bac..775e0d9353d5 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -349,6 +349,39 @@ static void tmio_mmc_transfer_data(struct tmio_mmc_host *host, /* * Transfer the data */ +#ifdef CONFIG_64BIT + if (host->pdata->flags & TMIO_MMC_64BIT_DATA_PORT) { + u64 *buf64 = (u64 *)buf; + u64 data = 0; + + if (count >= 8) { + if (is_read) + sd_ctrl_read64_rep(host, CTL_SD_DATA_PORT, + buf64, count >> 3); + else + sd_ctrl_write64_rep(host, CTL_SD_DATA_PORT, + buf64, count >> 3); + } + + /* if count was multiple of 8 */ + if (!(count & 0x7)) + return; + + buf64 += count >> 3; + count %= 8; + + if (is_read) { + sd_ctrl_read64_rep(host, CTL_SD_DATA_PORT, &data, 1); + memcpy(buf64, &data, count); + } else { + memcpy(&data, buf64, count); + sd_ctrl_write64_rep(host, CTL_SD_DATA_PORT, &data, 1); + } + + return; + } +#endif + if (host->pdata->flags & TMIO_MMC_32BIT_DATA_PORT) { u32 data = 0; u32 *buf32 = (u32 *)buf; diff --git a/include/linux/platform_data/tmio.h b/include/linux/platform_data/tmio.h index b060124ba1ae..426291713b83 100644 --- a/include/linux/platform_data/tmio.h +++ b/include/linux/platform_data/tmio.h @@ -47,6 +47,9 @@ /* Some controllers have a CBSY bit */ #define TMIO_MMC_HAVE_CBSY BIT(11) +/* Some controllers have a 64-bit wide data port register */ +#define TMIO_MMC_64BIT_DATA_PORT BIT(12) + struct tmio_mmc_data { void *chan_priv_tx; void *chan_priv_rx; -- cgit v1.2.3 From d2e6fb2c31a07f34e5e7533df11431cb0d2ecf9f Mon Sep 17 00:00:00 2001 From: Ricky Wu Date: Tue, 12 Aug 2025 11:08:11 +0800 Subject: misc: rtsx: usb card reader: add OCP support This patch adds support for Over Current Protection (OCP) to the Realtek USB card reader driver. The OCP mechanism protects the hardware by detecting and handling current overload conditions. This implementation includes: - Register configurations to enable OCP monitoring. - Handling of OCP interrupt events and associated error reporting. - Card power management changes in response to OCP triggers. 
This enhancement improves the robustness of the driver when operating in environments where electrical anomalies may occur, particularly with SD and MS card interfaces. Signed-off-by: Ricky Wu Link: https://lore.kernel.org/r/20250812030811.2426112-1-ricky_wu@realtek.com Signed-off-by: Ulf Hansson --- drivers/memstick/host/rtsx_usb_ms.c | 5 ++++- drivers/misc/cardreader/rtsx_usb.c | 7 +++++++ drivers/mmc/host/rtsx_usb_sdmmc.c | 33 ++++++++++++++++++++++++++++++--- include/linux/rtsx_usb.h | 11 +++++++++++ 4 files changed, 52 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/memstick/host/rtsx_usb_ms.c b/drivers/memstick/host/rtsx_usb_ms.c index 3878136227e4..9389e9643c24 100644 --- a/drivers/memstick/host/rtsx_usb_ms.c +++ b/drivers/memstick/host/rtsx_usb_ms.c @@ -216,7 +216,10 @@ static int ms_power_off(struct rtsx_usb_ms *host) rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_CLK_EN, MS_CLK_EN, 0); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_OE, MS_OUTPUT_EN, 0); - + rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL, + POWER_MASK, POWER_OFF); + rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL, + POWER_MASK | LDO3318_PWR_MASK, POWER_OFF | LDO_SUSPEND); err = rtsx_usb_send_cmd(ucr, MODE_C, 100); if (err < 0) return err; diff --git a/drivers/misc/cardreader/rtsx_usb.c b/drivers/misc/cardreader/rtsx_usb.c index d007a4455ce5..1830e9ed2521 100644 --- a/drivers/misc/cardreader/rtsx_usb.c +++ b/drivers/misc/cardreader/rtsx_usb.c @@ -552,6 +552,10 @@ static int rtsx_usb_reset_chip(struct rtsx_ucr *ucr) ret = rtsx_usb_send_cmd(ucr, MODE_C, 100); if (ret) return ret; + /* config OCP */ + rtsx_usb_write_register(ucr, OCPCTL, MS_OCP_DETECT_EN, MS_OCP_DETECT_EN); + rtsx_usb_write_register(ucr, OCPPARA1, 0xF0, 0x50); + rtsx_usb_write_register(ucr, OCPPARA2, 0x7, 0x3); /* config non-crystal mode */ rtsx_usb_read_register(ucr, CFG_MODE, &val); @@ -722,6 +726,9 @@ static int rtsx_usb_suspend(struct usb_interface *intf, pm_message_t message) if (val & (SD_CD | MS_CD)) { device_for_each_child(&intf->dev, NULL, rtsx_usb_resume_child); return -EAGAIN; + } else { + /* if the card does not exists, clear OCP status */ + rtsx_usb_write_register(ucr, OCPCTL, MS_OCP_CLEAR, MS_OCP_CLEAR); } } else { /* There is an ongoing operation*/ diff --git a/drivers/mmc/host/rtsx_usb_sdmmc.c b/drivers/mmc/host/rtsx_usb_sdmmc.c index c5f6b9df066b..e1ed39c657c3 100644 --- a/drivers/mmc/host/rtsx_usb_sdmmc.c +++ b/drivers/mmc/host/rtsx_usb_sdmmc.c @@ -48,7 +48,7 @@ struct rtsx_usb_sdmmc { bool ddr_mode; unsigned char power_mode; - + u16 ocp_stat; #ifdef RTSX_USB_USE_LEDS_CLASS struct led_classdev led; char led_name[32]; @@ -785,6 +785,9 @@ static int sdmmc_get_cd(struct mmc_host *mmc) mutex_unlock(&ucr->dev_mutex); + /* get OCP status */ + host->ocp_stat = (val >> 4) & 0x03; + /* Treat failed detection as non-exist */ if (err) goto no_card; @@ -795,6 +798,11 @@ static int sdmmc_get_cd(struct mmc_host *mmc) } no_card: + /* clear OCP status */ + if (host->ocp_stat & (MS_OCP_NOW | MS_OCP_EVER)) { + rtsx_usb_write_register(ucr, OCPCTL, MS_OCP_CLEAR, MS_OCP_CLEAR); + host->ocp_stat = 0; + } host->card_exist = false; return 0; } @@ -818,7 +826,11 @@ static void sdmmc_request(struct mmc_host *mmc, struct mmc_request *mrq) cmd->error = -ENOMEDIUM; goto finish_detect_card; } - + /* check OCP stat */ + if (host->ocp_stat & (MS_OCP_NOW | MS_OCP_EVER)) { + cmd->error = -ENOMEDIUM; + goto finish_detect_card; + } mutex_lock(&ucr->dev_mutex); mutex_lock(&host->host_mutex); @@ -952,6 +964,10 @@ static int sd_power_on(struct 
rtsx_usb_sdmmc *host) struct rtsx_ucr *ucr = host->ucr; int err; + if (host->ocp_stat & (MS_OCP_NOW | MS_OCP_EVER)) { + dev_dbg(sdmmc_dev(host), "over current\n"); + return -EIO; + } dev_dbg(sdmmc_dev(host), "%s\n", __func__); rtsx_usb_init_cmd(ucr); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_SELECT, 0x07, SD_MOD_SEL); @@ -977,9 +993,19 @@ static int sd_power_on(struct rtsx_usb_sdmmc *host) usleep_range(800, 1000); + rtsx_usb_init_cmd(ucr); + /* WA OCP issue: after OCP, there were problems with reopen card power */ + rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL, POWER_MASK, POWER_ON); + rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, FPDCTL, SSC_POWER_MASK, SSC_POWER_DOWN); + err = rtsx_usb_send_cmd(ucr, MODE_C, 100); + if (err) + return err; + msleep(20); + rtsx_usb_write_register(ucr, FPDCTL, SSC_POWER_MASK, SSC_POWER_ON); + usleep_range(180, 200); rtsx_usb_init_cmd(ucr); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_PWR_CTL, - POWER_MASK|LDO3318_PWR_MASK, POWER_ON|LDO_ON); + LDO3318_PWR_MASK, LDO_ON); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CARD_OE, SD_OUTPUT_EN, SD_OUTPUT_EN); @@ -1332,6 +1358,7 @@ static void rtsx_usb_init_host(struct rtsx_usb_sdmmc *host) mmc->max_req_size = 524288; host->power_mode = MMC_POWER_OFF; + host->ocp_stat = 0; } static int rtsx_usb_sdmmc_drv_probe(struct platform_device *pdev) diff --git a/include/linux/rtsx_usb.h b/include/linux/rtsx_usb.h index f267a06c6b1e..276b509c03e3 100644 --- a/include/linux/rtsx_usb.h +++ b/include/linux/rtsx_usb.h @@ -99,6 +99,17 @@ extern int rtsx_usb_card_exclusive_check(struct rtsx_ucr *ucr, int card); #define CD_MASK (SD_CD | MS_CD | XD_CD) #define SD_WP 0x08 +/* OCPCTL */ +#define MS_OCP_DETECT_EN 0x08 +#define MS_OCP_INT_EN 0x04 +#define MS_OCP_INT_CLR 0x02 +#define MS_OCP_CLEAR 0x01 + +/* OCPSTAT */ +#define MS_OCP_DETECT 0x80 +#define MS_OCP_NOW 0x02 +#define MS_OCP_EVER 0x01 + /* reader command field offset & parameters */ #define READ_REG_CMD 0 #define WRITE_REG_CMD 1 -- cgit v1.2.3 From 99e6cc80d5ce5af5781f84d20e4f3478d66ee8ee Mon Sep 17 00:00:00 2001 From: Benoît Monin Date: Mon, 18 Aug 2025 16:02:50 +0200 Subject: mmc: core: add mmc_read_tuning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide a function to the MMC hosts to read some blocks of data as part of their tuning. This function only returns the status of the read operation, not the data read. Signed-off-by: Benoît Monin Link: https://lore.kernel.org/r/20250818-mobileye-emmc-for-upstream-4-v4-5-34ecb3995e96@bootlin.com Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc_ops.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mmc/host.h | 1 + 2 files changed, 73 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c index 66283825513c..a952cc8265af 100644 --- a/drivers/mmc/core/mmc_ops.c +++ b/drivers/mmc/core/mmc_ops.c @@ -1077,3 +1077,75 @@ int mmc_sanitize(struct mmc_card *card, unsigned int timeout_ms) return err; } EXPORT_SYMBOL_GPL(mmc_sanitize); + +/** + * mmc_read_tuning() - read data blocks from the mmc + * @host: mmc host doing the read + * @blksz: data block size + * @blocks: number of blocks to read + * + * Read one or more blocks of data from the beginning of the mmc. This is a + * low-level helper for tuning operation. It is assumed that CMD23 can be used + * for multi-block read if the host supports it. + * + * Note: Allocate and free a temporary buffer to store the data read. 
The data + * is not available outside of the function, only the status of the read + * operation. + * + * Return: 0 in case of success, otherwise -EIO / -ENOMEM / -E2BIG + */ +int mmc_read_tuning(struct mmc_host *host, unsigned int blksz, unsigned int blocks) +{ + struct mmc_request mrq = {}; + struct mmc_command sbc = {}; + struct mmc_command cmd = {}; + struct mmc_command stop = {}; + struct mmc_data data = {}; + struct scatterlist sg; + void *buf; + unsigned int len; + + if (blocks > 1) { + if (mmc_host_can_cmd23(host)) { + mrq.sbc = &sbc; + sbc.opcode = MMC_SET_BLOCK_COUNT; + sbc.arg = blocks; + sbc.flags = MMC_RSP_R1 | MMC_CMD_AC; + } + cmd.opcode = MMC_READ_MULTIPLE_BLOCK; + mrq.stop = &stop; + stop.opcode = MMC_STOP_TRANSMISSION; + stop.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_AC; + } else { + cmd.opcode = MMC_READ_SINGLE_BLOCK; + } + + mrq.cmd = &cmd; + cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_ADTC; + + mrq.data = &data; + data.flags = MMC_DATA_READ; + data.blksz = blksz; + data.blocks = blocks; + data.blk_addr = 0; + data.sg = &sg; + data.sg_len = 1; + data.timeout_ns = 1000000000; + + if (check_mul_overflow(blksz, blocks, &len)) + return -E2BIG; + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + sg_init_one(&sg, buf, len); + + mmc_wait_for_req(host, &mrq); + kfree(buf); + + if (sbc.error || cmd.error || data.error) + return -EIO; + + return 0; +} +EXPORT_SYMBOL_GPL(mmc_read_tuning); diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 68f09a955a90..5ed5d203de23 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -743,5 +743,6 @@ int mmc_send_status(struct mmc_card *card, u32 *status); int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error); int mmc_send_abort_tuning(struct mmc_host *host, u32 opcode); int mmc_get_ext_csd(struct mmc_card *card, u8 **new_ext_csd); +int mmc_read_tuning(struct mmc_host *host, unsigned int blksz, unsigned int blocks); #endif /* LINUX_MMC_HOST_H */ -- cgit v1.2.3 From c3f0c02997c7f8489fec259e28e0e04e9811edac Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 18 Aug 2025 08:40:26 -0700 Subject: net: Add skb_dstref_steal and skb_dstref_restore Going forward skb_dst_set will assert that skb dst_entry is empty during skb_dst_set to prevent potential leaks. There are few places that still manually manage dst_entry not using the helpers. Convert them to the following new helpers: - skb_dstref_steal that resets dst_entry and returns previous dst_entry value - skb_dstref_restore that restores dst_entry previously reset via skb_dstref_steal Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250818154032.3173645-2-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 14b923ddb6df..7538ca507ee9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1159,6 +1159,38 @@ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK); } +/** + * skb_dstref_steal() - return current dst_entry value and clear it + * @skb: buffer + * + * Resets skb dst_entry without adjusting its reference count. Useful in + * cases where dst_entry needs to be temporarily reset and restored. + * Note that the returned value cannot be used directly because it + * might contain SKB_DST_NOREF bit. 
+ * + * When in doubt, prefer skb_dst_drop() over skb_dstref_steal() to correctly + * handle dst_entry reference counting. + * + * Returns: original skb dst_entry. + */ +static inline unsigned long skb_dstref_steal(struct sk_buff *skb) +{ + unsigned long refdst = skb->_skb_refdst; + + skb->_skb_refdst = 0; + return refdst; +} + +/** + * skb_dstref_restore() - restore skb dst_entry removed via skb_dstref_steal() + * @skb: buffer + * @refdst: dst entry from a call to skb_dstref_steal() + */ +static inline void skb_dstref_restore(struct sk_buff *skb, unsigned long refdst) +{ + skb->_skb_refdst = refdst; +} + /** * skb_dst_set - sets skb dst * @skb: buffer -- cgit v1.2.3 From a890348adcc993f48d1ae38f1174dc8de4c3c5ac Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 18 Aug 2025 08:40:32 -0700 Subject: net: Add skb_dst_check_unset To prevent dst_entry leaks, add warning when the non-NULL dst_entry is rewritten. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250818154032.3173645-8-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7538ca507ee9..ca8be45dd8be 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1159,6 +1159,12 @@ static inline struct dst_entry *skb_dst(const struct sk_buff *skb) return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK); } +static inline void skb_dst_check_unset(struct sk_buff *skb) +{ + DEBUG_NET_WARN_ON_ONCE((skb->_skb_refdst & SKB_DST_PTRMASK) && + !(skb->_skb_refdst & SKB_DST_NOREF)); +} + /** * skb_dstref_steal() - return current dst_entry value and clear it * @skb: buffer @@ -1188,6 +1194,7 @@ static inline unsigned long skb_dstref_steal(struct sk_buff *skb) */ static inline void skb_dstref_restore(struct sk_buff *skb, unsigned long refdst) { + skb_dst_check_unset(skb); skb->_skb_refdst = refdst; } @@ -1201,6 +1208,7 @@ static inline void skb_dstref_restore(struct sk_buff *skb, unsigned long refdst) */ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) { + skb_dst_check_unset(skb); skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst; } @@ -1217,6 +1225,7 @@ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) */ static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) { + skb_dst_check_unset(skb); WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); skb->slow_gro |= !!dst; skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; -- cgit v1.2.3 From 68889dfd547bd8eabc5a98b58475d7b901cf5129 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:09 +0000 Subject: mptcp: Fix up subflow's memcg when CONFIG_SOCK_CGROUP_DATA=n. When sk_alloc() allocates a socket, mem_cgroup_sk_alloc() sets sk->sk_memcg based on the current task. MPTCP subflow socket creation is triggered from userspace or an in-kernel worker. In the latter case, sk->sk_memcg is not what we want. So, we fix it up from the parent socket's sk->sk_memcg in mptcp_attach_cgroup(). Although the code is placed under #ifdef CONFIG_MEMCG, it is buried under #ifdef CONFIG_SOCK_CGROUP_DATA. The two configs are orthogonal. If CONFIG_MEMCG is enabled without CONFIG_SOCK_CGROUP_DATA, the subflow's memory usage is not charged correctly. Let's move the code out of the wrong ifdef guard. 
Note that sk->sk_memcg is freed in sk_prot_free() and the parent sk holds the refcnt of memcg->css here, so we don't need to use css_tryget(). Fixes: 3764b0c5651e3 ("mptcp: attach subflow socket to parent cgroup") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Matthieu Baerts (NGI0) Acked-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/memcontrol.h | 6 ++++++ mm/memcontrol.c | 13 +++++++++++++ net/mptcp/subflow.c | 11 +++-------- 3 files changed, 22 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 785173aa0739..25921fbec685 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1604,6 +1604,7 @@ extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) void mem_cgroup_sk_alloc(struct sock *sk); void mem_cgroup_sk_free(struct sock *sk); +void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk); #if BITS_PER_LONG < 64 static inline void mem_cgroup_set_socket_pressure(struct mem_cgroup *memcg) @@ -1661,6 +1662,11 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg); #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; static inline void mem_cgroup_sk_free(struct sock *sk) { }; + +static inline void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) +{ +} + static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8dd7fbed5a94..46713b9ece06 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5024,6 +5024,19 @@ void mem_cgroup_sk_free(struct sock *sk) css_put(&sk->sk_memcg->css); } +void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) +{ + if (sk->sk_memcg == newsk->sk_memcg) + return; + + mem_cgroup_sk_free(newsk); + + if (sk->sk_memcg) + css_get(&sk->sk_memcg->css); + + newsk->sk_memcg = sk->sk_memcg; +} + /** * mem_cgroup_charge_skmem - charge socket memory * @memcg: memcg to charge diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3f1b62a9fe88..c8a7e4b59db1 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1717,19 +1717,14 @@ static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) /* only the additional subflows created by kworkers have to be modified */ if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != cgroup_id(sock_cgroup_ptr(child_skcd))) { -#ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = parent->sk_memcg; - - mem_cgroup_sk_free(child); - if (memcg && css_tryget(&memcg->css)) - child->sk_memcg = memcg; -#endif /* CONFIG_MEMCG */ - cgroup_sk_free(child_skcd); *child_skcd = *parent_skcd; cgroup_sk_clone(child_skcd); } #endif /* CONFIG_SOCK_CGROUP_DATA */ + + if (mem_cgroup_sockets_enabled) + mem_cgroup_sk_inherit(parent, child); } static void mptcp_subflow_ops_override(struct sock *ssk) -- cgit v1.2.3 From bb178c6bc08525d758a57775458d644304011bf8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:16 +0000 Subject: net-memcg: Pass struct sock to mem_cgroup_sk_(un)?charge(). We will store a flag in the lowest bit of sk->sk_memcg. Then, we cannot pass the raw pointer to mem_cgroup_charge_skmem() and mem_cgroup_uncharge_skmem(). Let's pass struct sock to the functions. 
While at it, they are renamed to match other functions starting with mem_cgroup_sk_. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Acked-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-9-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/memcontrol.h | 29 ++++++++++++++++++++++++----- mm/memcontrol.c | 18 +++++++++++------- net/core/sock.c | 24 +++++++++++------------- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/tcp_output.c | 3 +-- 5 files changed, 48 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 25921fbec685..0837d3de3a68 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1596,15 +1596,16 @@ static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb) #endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; -bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, - gfp_t gfp_mask); -void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); #ifdef CONFIG_MEMCG extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) + void mem_cgroup_sk_alloc(struct sock *sk); void mem_cgroup_sk_free(struct sock *sk); void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk); +bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages, + gfp_t gfp_mask); +void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages); #if BITS_PER_LONG < 64 static inline void mem_cgroup_set_socket_pressure(struct mem_cgroup *memcg) @@ -1660,13 +1661,31 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); void reparent_shrinker_deferred(struct mem_cgroup *memcg); #else #define mem_cgroup_sockets_enabled 0 -static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; -static inline void mem_cgroup_sk_free(struct sock *sk) { }; + +static inline void mem_cgroup_sk_alloc(struct sock *sk) +{ +} + +static inline void mem_cgroup_sk_free(struct sock *sk) +{ +} static inline void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) { } +static inline bool mem_cgroup_sk_charge(const struct sock *sk, + unsigned int nr_pages, + gfp_t gfp_mask) +{ + return false; +} + +static inline void mem_cgroup_sk_uncharge(const struct sock *sk, + unsigned int nr_pages) +{ +} + static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d8a52d1d08fa..df3e9205c9e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5043,17 +5043,19 @@ void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk) } /** - * mem_cgroup_charge_skmem - charge socket memory - * @memcg: memcg to charge + * mem_cgroup_sk_charge - charge socket memory + * @sk: socket in memcg to charge * @nr_pages: number of pages to charge * @gfp_mask: reclaim mode * * Charges @nr_pages to @memcg. Returns %true if the charge fit within * @memcg's configured limit, %false if it doesn't. 
*/ -bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, - gfp_t gfp_mask) +bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages, + gfp_t gfp_mask) { + struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return memcg1_charge_skmem(memcg, nr_pages, gfp_mask); @@ -5066,12 +5068,14 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, } /** - * mem_cgroup_uncharge_skmem - uncharge socket memory - * @memcg: memcg to uncharge + * mem_cgroup_sk_uncharge - uncharge socket memory + * @sk: socket in memcg to uncharge * @nr_pages: number of pages to uncharge */ -void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) +void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages) { + struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { memcg1_uncharge_skmem(memcg, nr_pages); return; diff --git a/net/core/sock.c b/net/core/sock.c index ab658fe23e1e..5537ca263858 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1041,8 +1041,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes) pages = sk_mem_pages(bytes); /* pre-charge to memcg */ - charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + charged = mem_cgroup_sk_charge(sk, pages, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!charged) return -ENOMEM; @@ -1054,7 +1054,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes) */ if (allocated > sk_prot_mem_limits(sk, 1)) { sk_memory_allocated_sub(sk, pages); - mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); + mem_cgroup_sk_uncharge(sk, pages); return -ENOMEM; } sk_forward_alloc_add(sk, pages << PAGE_SHIFT); @@ -3263,17 +3263,16 @@ EXPORT_SYMBOL(sk_wait_data); */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { + bool memcg_enabled = false, charged = false; struct proto *prot = sk->sk_prot; - struct mem_cgroup *memcg = NULL; - bool charged = false; long allocated; sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); if (mem_cgroup_sk_enabled(sk)) { - memcg = sk->sk_memcg; - charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()); + memcg_enabled = true; + charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge()); if (!charged) goto suppress_allocation; } @@ -3347,10 +3346,9 @@ suppress_allocation: */ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { /* Force charge with __GFP_NOFAIL */ - if (memcg && !charged) { - mem_cgroup_charge_skmem(memcg, amt, - gfp_memcg_charge() | __GFP_NOFAIL); - } + if (memcg_enabled && !charged) + mem_cgroup_sk_charge(sk, amt, + gfp_memcg_charge() | __GFP_NOFAIL); return 1; } } @@ -3360,7 +3358,7 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); if (charged) - mem_cgroup_uncharge_skmem(memcg, amt); + mem_cgroup_sk_uncharge(sk, amt); return 0; } @@ -3399,7 +3397,7 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount) sk_memory_allocated_sub(sk, amount); if (mem_cgroup_sk_enabled(sk)) - mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); + mem_cgroup_sk_uncharge(sk, amount); if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 93569bbe00f4..0ef1eacd539d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -727,7 +727,7 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg) 
} if (amt) - mem_cgroup_charge_skmem(newsk->sk_memcg, amt, gfp); + mem_cgroup_sk_charge(newsk, amt, gfp); kmem_cache_charge(newsk, gfp); release_sock(newsk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 37fb320e6f70..dfbac0876d96 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3579,8 +3579,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size) sk_memory_allocated_add(sk, amt); if (mem_cgroup_sk_enabled(sk)) - mem_cgroup_charge_skmem(sk->sk_memcg, amt, - gfp_memcg_charge() | __GFP_NOFAIL); + mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL); } /* Send a FIN. The caller locks the socket for us. -- cgit v1.2.3 From b2ffd10cddde47cc6830e4981e91e3215def62b1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 15 Aug 2025 20:16:17 +0000 Subject: net-memcg: Pass struct sock to mem_cgroup_sk_under_memory_pressure(). We will store a flag in the lowest bit of sk->sk_memcg. Then, we cannot pass the raw pointer to mem_cgroup_under_socket_pressure(). Let's pass struct sock to it and rename the function to match other functions starting with mem_cgroup_sk_. Note that the helper is moved to sock.h to use mem_cgroup_from_sk(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Acked-by: Shakeel Butt Link: https://patch.msgid.link/20250815201712.1745332-10-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/memcontrol.h | 18 ------------------ include/net/proto_memory.h | 2 +- include/net/sock.h | 22 ++++++++++++++++++++++ include/net/tcp.h | 2 +- 4 files changed, 24 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0837d3de3a68..fb27e3d2fdac 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1642,19 +1642,6 @@ static inline u64 mem_cgroup_get_socket_pressure(struct mem_cgroup *memcg) } #endif -static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) -{ -#ifdef CONFIG_MEMCG_V1 - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - return !!memcg->tcpmem_pressure; -#endif /* CONFIG_MEMCG_V1 */ - do { - if (time_before64(get_jiffies_64(), mem_cgroup_get_socket_pressure(memcg))) - return true; - } while ((memcg = parent_mem_cgroup(memcg))); - return false; -} - int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); @@ -1686,11 +1673,6 @@ static inline void mem_cgroup_sk_uncharge(const struct sock *sk, { } -static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) -{ - return false; -} - static inline void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h index 859e63de81c4..8e91a8fa31b5 100644 --- a/include/net/proto_memory.h +++ b/include/net/proto_memory.h @@ -32,7 +32,7 @@ static inline bool sk_under_memory_pressure(const struct sock *sk) return false; if (mem_cgroup_sk_enabled(sk) && - mem_cgroup_under_socket_pressure(sk->sk_memcg)) + mem_cgroup_sk_under_memory_pressure(sk)) return true; return !!READ_ONCE(*sk->sk_prot->memory_pressure); diff --git a/include/net/sock.h b/include/net/sock.h index 3efdf680401d..3bc4d566f7d0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2604,6 +2604,23 @@ static inline bool mem_cgroup_sk_enabled(const struct sock *sk) { return mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk); } + +static 
inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_sk(sk);
+
+#ifdef CONFIG_MEMCG_V1
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		return !!memcg->tcpmem_pressure;
+#endif /* CONFIG_MEMCG_V1 */
+
+	do {
+		if (time_before64(get_jiffies_64(), mem_cgroup_get_socket_pressure(memcg)))
+			return true;
+	} while ((memcg = parent_mem_cgroup(memcg)));
+
+	return false;
+}
 #else
 static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk)
 {
@@ -2614,6 +2631,11 @@ static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
 {
 	return false;
 }
+
+static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
+{
+	return false;
+}
 #endif
 
 static inline long sock_rcvtimeo(const struct sock *sk, bool noblock)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9f01b6be6444..2936b8175950 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -276,7 +276,7 @@ extern unsigned long tcp_memory_pressure;
 static inline bool tcp_under_memory_pressure(const struct sock *sk)
 {
 	if (mem_cgroup_sk_enabled(sk) &&
-	    mem_cgroup_under_socket_pressure(sk->sk_memcg))
+	    mem_cgroup_sk_under_memory_pressure(sk))
 		return true;
 
 	return READ_ONCE(tcp_memory_pressure);
-- cgit v1.2.3

From 8fc6056dcf79937c46c97fa4996cda65956437a9 Mon Sep 17 00:00:00 2001
From: Chao Yu
Date: Thu, 7 Aug 2025 10:44:31 +0800
Subject: f2fs: fix to detect potential corrupted nid in free_nid_list

As reported, on-disk footer.ino and footer.nid are the same and out of range, so let's add a sanity check to f2fs_alloc_nid() to detect any potential corruption in the free_nid_list.

Signed-off-by: Chao Yu
Signed-off-by: Jaegeuk Kim
---
 fs/f2fs/node.c          | 17 ++++++++++++++++-
 include/linux/f2fs_fs.h |  1 +
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 27743b93e186..286e8a53f8e7 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -27,12 +27,17 @@ static struct kmem_cache *free_nid_slab;
 static struct kmem_cache *nat_entry_set_slab;
 static struct kmem_cache *fsync_node_entry_slab;
 
+static inline bool is_invalid_nid(struct f2fs_sb_info *sbi, nid_t nid)
+{
+	return nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid;
+}
+
 /*
  * Check whether the given nid is within node id range.
*/ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { - if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) { + if (unlikely(is_invalid_nid(sbi, nid))) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.", __func__, nid); @@ -2634,6 +2639,16 @@ retry: f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); i = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); + + if (unlikely(is_invalid_nid(sbi, i->nid))) { + spin_unlock(&nm_i->nid_list_lock); + f2fs_err(sbi, "Corrupted nid %u in free_nid_list", + i->nid); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_CORRUPTED_NID); + return false; + } + *nid = i->nid; __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 2f8b8bfc0e73..6afb4a13b81d 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -79,6 +79,7 @@ enum stop_cp_reason { STOP_CP_REASON_FLUSH_FAIL, STOP_CP_REASON_NO_SEGMENT, STOP_CP_REASON_CORRUPTED_FREE_BITMAP, + STOP_CP_REASON_CORRUPTED_NID, STOP_CP_REASON_MAX, }; -- cgit v1.2.3 From 5f8a4f34f6dcf4b64c3cbcfd4aa5a8d7dbcd268d Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 19 Aug 2025 09:39:15 -0700 Subject: bnxt_en: hsi: Update FW interface to 1.10.3.133 The major change is struct pcie_ctx_hw_stats_v2 which has new latency histograms added. Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250819163919.104075-2-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- include/linux/bnxt/hsi.h | 315 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 253 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bnxt/hsi.h b/include/linux/bnxt/hsi.h index 549231703bce..8c5dac3b3ef3 100644 --- a/include/linux/bnxt/hsi.h +++ b/include/linux/bnxt/hsi.h @@ -276,6 +276,10 @@ struct cmd_nums { #define HWRM_REG_POWER_QUERY 0xe1UL #define HWRM_CORE_FREQUENCY_QUERY 0xe2UL #define HWRM_REG_POWER_HISTOGRAM 0xe3UL + #define HWRM_MONITOR_PAX_HISTOGRAM_START 0xe4UL + #define HWRM_MONITOR_PAX_HISTOGRAM_COLLECT 0xe5UL + #define HWRM_STAT_QUERY_ROCE_STATS 0xe6UL + #define HWRM_STAT_QUERY_ROCE_STATS_EXT 0xe7UL #define HWRM_WOL_FILTER_ALLOC 0xf0UL #define HWRM_WOL_FILTER_FREE 0xf1UL #define HWRM_WOL_FILTER_QCFG 0xf2UL @@ -407,9 +411,8 @@ struct cmd_nums { #define HWRM_FUNC_LAG_UPDATE 0x1b1UL #define HWRM_FUNC_LAG_FREE 0x1b2UL #define HWRM_FUNC_LAG_QCFG 0x1b3UL - #define HWRM_FUNC_TIMEDTX_PACING_RATE_ADD 0x1c2UL - #define HWRM_FUNC_TIMEDTX_PACING_RATE_DELETE 0x1c3UL - #define HWRM_FUNC_TIMEDTX_PACING_RATE_QUERY 0x1c4UL + #define HWRM_FUNC_TTX_PACING_RATE_PROF_QUERY 0x1c3UL + #define HWRM_FUNC_TTX_PACING_RATE_QUERY 0x1c4UL #define HWRM_SELFTEST_QLIST 0x200UL #define HWRM_SELFTEST_EXEC 0x201UL #define HWRM_SELFTEST_IRQ 0x202UL @@ -441,6 +444,7 @@ struct cmd_nums { #define HWRM_MFG_WRITE_CERT_NVM 0x21cUL #define HWRM_PORT_POE_CFG 0x230UL #define HWRM_PORT_POE_QCFG 0x231UL + #define HWRM_PORT_PHY_FDRSTAT 0x232UL #define HWRM_UDCC_QCAPS 0x258UL #define HWRM_UDCC_CFG 0x259UL #define HWRM_UDCC_QCFG 0x25aUL @@ -453,6 +457,8 @@ struct cmd_nums { #define HWRM_QUEUE_PFCWD_TIMEOUT_QCAPS 0x261UL #define HWRM_QUEUE_PFCWD_TIMEOUT_CFG 0x262UL #define HWRM_QUEUE_PFCWD_TIMEOUT_QCFG 0x263UL + #define HWRM_QUEUE_ADPTV_QOS_RX_QCFG 0x264UL + #define HWRM_QUEUE_ADPTV_QOS_TX_QCFG 0x265UL #define HWRM_TF 0x2bcUL #define HWRM_TF_VERSION_GET 0x2bdUL #define HWRM_TF_SESSION_OPEN 0x2c6UL @@ -551,6 +557,8 @@ struct cmd_nums { #define HWRM_DBG_COREDUMP_CAPTURE 
0xff2cUL #define HWRM_DBG_PTRACE 0xff2dUL #define HWRM_DBG_SIM_CABLE_STATE 0xff2eUL + #define HWRM_DBG_TOKEN_QUERY_AUTH_IDS 0xff2fUL + #define HWRM_DBG_TOKEN_CFG 0xff30UL #define HWRM_NVM_GET_VPD_FIELD_INFO 0xffeaUL #define HWRM_NVM_SET_VPD_FIELD_INFO 0xffebUL #define HWRM_NVM_DEFRAG 0xffecUL @@ -632,8 +640,8 @@ struct hwrm_err_output { #define HWRM_VERSION_MAJOR 1 #define HWRM_VERSION_MINOR 10 #define HWRM_VERSION_UPDATE 3 -#define HWRM_VERSION_RSVD 97 -#define HWRM_VERSION_STR "1.10.3.97" +#define HWRM_VERSION_RSVD 133 +#define HWRM_VERSION_STR "1.10.3.133" /* hwrm_ver_get_input (size:192b/24B) */ struct hwrm_ver_get_input { @@ -688,6 +696,7 @@ struct hwrm_ver_get_output { #define VER_GET_RESP_DEV_CAPS_CFG_CFA_TRUFLOW_SUPPORTED 0x4000UL #define VER_GET_RESP_DEV_CAPS_CFG_SECURE_BOOT_CAPABLE 0x8000UL #define VER_GET_RESP_DEV_CAPS_CFG_SECURE_SOC_CAPABLE 0x10000UL + #define VER_GET_RESP_DEV_CAPS_CFG_DEBUG_TOKEN_SUPPORTED 0x20000UL u8 roce_fw_maj_8b; u8 roce_fw_min_8b; u8 roce_fw_bld_8b; @@ -872,7 +881,8 @@ struct hwrm_async_event_cmpl { #define ASYNC_EVENT_CMPL_EVENT_ID_REPRESENTOR_PAIR_CHANGE 0x4eUL #define ASYNC_EVENT_CMPL_EVENT_ID_VF_STAT_CHANGE 0x4fUL #define ASYNC_EVENT_CMPL_EVENT_ID_HOST_COREDUMP 0x50UL - #define ASYNC_EVENT_CMPL_EVENT_ID_MAX_RGTR_EVENT_ID 0x51UL + #define ASYNC_EVENT_CMPL_EVENT_ID_ADPTV_QOS 0x51UL + #define ASYNC_EVENT_CMPL_EVENT_ID_MAX_RGTR_EVENT_ID 0x52UL #define ASYNC_EVENT_CMPL_EVENT_ID_FW_TRACE_MSG 0xfeUL #define ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR 0xffUL #define ASYNC_EVENT_CMPL_EVENT_ID_LAST ASYNC_EVENT_CMPL_EVENT_ID_HWRM_ERROR @@ -1344,7 +1354,8 @@ struct hwrm_async_event_cmpl_dbg_buf_producer { #define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_CA2_TRACE 0x9UL #define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_RIGP1_TRACE 0xaUL #define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_AFM_KONG_HWRM_TRACE 0xbUL - #define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_LAST ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_AFM_KONG_HWRM_TRACE + #define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_ERR_QPC_TRACE 0xcUL + #define ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_LAST ASYNC_EVENT_CMPL_DBG_BUF_PRODUCER_EVENT_DATA1_TYPE_ERR_QPC_TRACE }; /* hwrm_async_event_cmpl_hwrm_error (size:128b/16B) */ @@ -1401,7 +1412,11 @@ struct hwrm_async_event_cmpl_error_report_base { #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD 0x4UL #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_THERMAL_THRESHOLD 0x5UL #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DUAL_DATA_RATE_NOT_SUPPORTED 0x6UL - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_LAST ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DUAL_DATA_RATE_NOT_SUPPORTED + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DUP_UDCC_SES 0x7UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DB_DROP 0x8UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_MD_TEMP 0x9UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_VNIC_ERR 0xaUL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_LAST ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_VNIC_ERR }; /* hwrm_async_event_cmpl_error_report_pause_storm (size:128b/16B) */ @@ -1914,6 +1929,12 @@ struct hwrm_func_qcaps_output { #define FUNC_QCAPS_RESP_FLAGS_EXT3_RX_RATE_PROFILE_SEL_SUPPORTED 0x8UL #define FUNC_QCAPS_RESP_FLAGS_EXT3_BIDI_OPT_SUPPORTED 0x10UL 
#define FUNC_QCAPS_RESP_FLAGS_EXT3_MIRROR_ON_ROCE_SUPPORTED 0x20UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_ROCE_VF_DYN_ALLOC_SUPPORT 0x40UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_CHANGE_UDP_SRCPORT_SUPPORT 0x80UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_PCIE_COMPLIANCE_SUPPORTED 0x100UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_MULTI_L2_DB_SUPPORTED 0x200UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_PCIE_SECURE_ATS_SUPPORTED 0x400UL + #define FUNC_QCAPS_RESP_FLAGS_EXT3_MBUF_STATS_SUPPORTED 0x800UL __le16 max_roce_vfs; __le16 max_crypto_rx_flow_filters; u8 unused_3[3]; @@ -1931,7 +1952,7 @@ struct hwrm_func_qcfg_input { u8 unused_0[6]; }; -/* hwrm_func_qcfg_output (size:1344b/168B) */ +/* hwrm_func_qcfg_output (size:1408b/176B) */ struct hwrm_func_qcfg_output { __le16 error_code; __le16 req_type; @@ -2124,7 +2145,43 @@ struct hwrm_func_qcfg_output { #define FUNC_QCFG_RESP_XID_PARTITION_CFG_TX_CK 0x1UL #define FUNC_QCFG_RESP_XID_PARTITION_CFG_RX_CK 0x2UL __le16 mirror_vnic_id; - u8 unused_7[7]; + u8 max_link_width; + #define FUNC_QCFG_RESP_MAX_LINK_WIDTH_UNKNOWN 0x0UL + #define FUNC_QCFG_RESP_MAX_LINK_WIDTH_X1 0x1UL + #define FUNC_QCFG_RESP_MAX_LINK_WIDTH_X2 0x2UL + #define FUNC_QCFG_RESP_MAX_LINK_WIDTH_X4 0x4UL + #define FUNC_QCFG_RESP_MAX_LINK_WIDTH_X8 0x8UL + #define FUNC_QCFG_RESP_MAX_LINK_WIDTH_X16 0x10UL + #define FUNC_QCFG_RESP_MAX_LINK_WIDTH_LAST FUNC_QCFG_RESP_MAX_LINK_WIDTH_X16 + u8 max_link_speed; + #define FUNC_QCFG_RESP_MAX_LINK_SPEED_UNKNOWN 0x0UL + #define FUNC_QCFG_RESP_MAX_LINK_SPEED_G1 0x1UL + #define FUNC_QCFG_RESP_MAX_LINK_SPEED_G2 0x2UL + #define FUNC_QCFG_RESP_MAX_LINK_SPEED_G3 0x3UL + #define FUNC_QCFG_RESP_MAX_LINK_SPEED_G4 0x4UL + #define FUNC_QCFG_RESP_MAX_LINK_SPEED_G5 0x5UL + #define FUNC_QCFG_RESP_MAX_LINK_SPEED_LAST FUNC_QCFG_RESP_MAX_LINK_SPEED_G5 + u8 negotiated_link_width; + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_WIDTH_UNKNOWN 0x0UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_WIDTH_X1 0x1UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_WIDTH_X2 0x2UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_WIDTH_X4 0x4UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_WIDTH_X8 0x8UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_WIDTH_X16 0x10UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_WIDTH_LAST FUNC_QCFG_RESP_NEGOTIATED_LINK_WIDTH_X16 + u8 negotiated_link_speed; + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_SPEED_UNKNOWN 0x0UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_SPEED_G1 0x1UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_SPEED_G2 0x2UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_SPEED_G3 0x3UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_SPEED_G4 0x4UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_SPEED_G5 0x5UL + #define FUNC_QCFG_RESP_NEGOTIATED_LINK_SPEED_LAST FUNC_QCFG_RESP_NEGOTIATED_LINK_SPEED_G5 + u8 unused_7[2]; + u8 pcie_compliance; + u8 unused_8; + __le16 l2_db_multi_page_size_kb; + u8 unused_9[5]; u8 valid; }; @@ -2322,6 +2379,7 @@ struct hwrm_func_cfg_input { #define FUNC_CFG_REQ_ENABLES2_ROCE_MAX_GID_PER_VF 0x200UL #define FUNC_CFG_REQ_ENABLES2_XID_PARTITION_CFG 0x400UL #define FUNC_CFG_REQ_ENABLES2_PHYSICAL_SLOT_NUMBER 0x800UL + #define FUNC_CFG_REQ_ENABLES2_PCIE_COMPLIANCE 0x1000UL u8 port_kdnet_mode; #define FUNC_CFG_REQ_PORT_KDNET_MODE_DISABLED 0x0UL #define FUNC_CFG_REQ_PORT_KDNET_MODE_ENABLED 0x1UL @@ -2353,7 +2411,8 @@ struct hwrm_func_cfg_input { __le16 xid_partition_cfg; #define FUNC_CFG_REQ_XID_PARTITION_CFG_TX_CK 0x1UL #define FUNC_CFG_REQ_XID_PARTITION_CFG_RX_CK 0x2UL - __le16 unused_2; + u8 pcie_compliance; + u8 unused_2; }; /* hwrm_func_cfg_output (size:128b/16B) */ @@ -2370,11 +2429,41 @@ struct 
hwrm_func_cfg_output { struct hwrm_func_cfg_cmd_err { u8 code; #define FUNC_CFG_CMD_ERR_CODE_UNKNOWN 0x0UL - #define FUNC_CFG_CMD_ERR_CODE_PARTITION_MIN_BW_RANGE 0x1UL - #define FUNC_CFG_CMD_ERR_CODE_PARTITION_MIN_MORE_THAN_MAX 0x2UL - #define FUNC_CFG_CMD_ERR_CODE_PARTITION_MIN_BW_UNSUPPORTED 0x3UL - #define FUNC_CFG_CMD_ERR_CODE_PARTITION_BW_PERCENT 0x4UL - #define FUNC_CFG_CMD_ERR_CODE_LAST FUNC_CFG_CMD_ERR_CODE_PARTITION_BW_PERCENT + #define FUNC_CFG_CMD_ERR_CODE_PARTITION_BW_OUT_OF_RANGE 0x1UL + #define FUNC_CFG_CMD_ERR_CODE_NPAR_PARTITION_DOWN_FAILED 0x2UL + #define FUNC_CFG_CMD_ERR_CODE_TPID_SET_DFLT_VLAN_NOT_SET 0x3UL + #define FUNC_CFG_CMD_ERR_CODE_RES_ARRAY_ALLOC_FAILED 0x4UL + #define FUNC_CFG_CMD_ERR_CODE_TX_RING_ASSET_TEST_FAILED 0x5UL + #define FUNC_CFG_CMD_ERR_CODE_TX_RING_RES_UPDATE_FAILED 0x6UL + #define FUNC_CFG_CMD_ERR_CODE_APPLY_MAX_BW_FAILED 0x7UL + #define FUNC_CFG_CMD_ERR_CODE_ENABLE_EVB_FAILED 0x8UL + #define FUNC_CFG_CMD_ERR_CODE_RSS_CTXT_ASSET_TEST_FAILED 0x9UL + #define FUNC_CFG_CMD_ERR_CODE_RSS_CTXT_RES_UPDATE_FAILED 0xaUL + #define FUNC_CFG_CMD_ERR_CODE_CMPL_RING_ASSET_TEST_FAILED 0xbUL + #define FUNC_CFG_CMD_ERR_CODE_CMPL_RING_RES_UPDATE_FAILED 0xcUL + #define FUNC_CFG_CMD_ERR_CODE_NQ_ASSET_TEST_FAILED 0xdUL + #define FUNC_CFG_CMD_ERR_CODE_NQ_RES_UPDATE_FAILED 0xeUL + #define FUNC_CFG_CMD_ERR_CODE_RX_RING_ASSET_TEST_FAILED 0xfUL + #define FUNC_CFG_CMD_ERR_CODE_RX_RING_RES_UPDATE_FAILED 0x10UL + #define FUNC_CFG_CMD_ERR_CODE_VNIC_ASSET_TEST_FAILED 0x11UL + #define FUNC_CFG_CMD_ERR_CODE_VNIC_RES_UPDATE_FAILED 0x12UL + #define FUNC_CFG_CMD_ERR_CODE_FAILED_TO_START_STATS_THREAD 0x13UL + #define FUNC_CFG_CMD_ERR_CODE_RDMA_SRIOV_DISABLED 0x14UL + #define FUNC_CFG_CMD_ERR_CODE_TX_KTLS_DISABLED 0x15UL + #define FUNC_CFG_CMD_ERR_CODE_TX_KTLS_ASSET_TEST_FAILED 0x16UL + #define FUNC_CFG_CMD_ERR_CODE_TX_KTLS_RES_UPDATE_FAILED 0x17UL + #define FUNC_CFG_CMD_ERR_CODE_RX_KTLS_DISABLED 0x18UL + #define FUNC_CFG_CMD_ERR_CODE_RX_KTLS_ASSET_TEST_FAILED 0x19UL + #define FUNC_CFG_CMD_ERR_CODE_RX_KTLS_RES_UPDATE_FAILED 0x1aUL + #define FUNC_CFG_CMD_ERR_CODE_TX_QUIC_DISABLED 0x1bUL + #define FUNC_CFG_CMD_ERR_CODE_TX_QUIC_ASSET_TEST_FAILED 0x1cUL + #define FUNC_CFG_CMD_ERR_CODE_TX_QUIC_RES_UPDATE_FAILED 0x1dUL + #define FUNC_CFG_CMD_ERR_CODE_RX_QUIC_DISABLED 0x1eUL + #define FUNC_CFG_CMD_ERR_CODE_RX_QUIC_ASSET_TEST_FAILED 0x1fUL + #define FUNC_CFG_CMD_ERR_CODE_RX_QUIC_RES_UPDATE_FAILED 0x20UL + #define FUNC_CFG_CMD_ERR_CODE_INVALID_KDNET_MODE 0x21UL + #define FUNC_CFG_CMD_ERR_CODE_SCHQ_CFG_FAIL 0x22UL + #define FUNC_CFG_CMD_ERR_CODE_LAST FUNC_CFG_CMD_ERR_CODE_SCHQ_CFG_FAIL u8 unused_0[7]; }; @@ -3780,6 +3869,7 @@ struct hwrm_func_backing_store_cfg_v2_input { #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_CA2_TRACE 0x28UL #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_RIGP1_TRACE 0x29UL #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_AFM_KONG_HWRM_TRACE 0x2aUL + #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_ERR_QPC_TRACE 0x2bUL #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_INVALID 0xffffUL #define FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_LAST FUNC_BACKING_STORE_CFG_V2_REQ_TYPE_INVALID __le16 instance; @@ -3865,6 +3955,7 @@ struct hwrm_func_backing_store_qcfg_v2_input { #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_CA2_TRACE 0x28UL #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_RIGP1_TRACE 0x29UL #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_AFM_KONG_HWRM_TRACE 0x2aUL + #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_ERR_QPC_TRACE 0x2bUL #define FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_INVALID 0xffffUL #define 
FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_LAST FUNC_BACKING_STORE_QCFG_V2_REQ_TYPE_INVALID __le16 instance; @@ -3904,6 +3995,7 @@ struct hwrm_func_backing_store_qcfg_v2_output { #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CA1_TRACE 0x27UL #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_CA2_TRACE 0x28UL #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_RIGP1_TRACE 0x29UL + #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_ERR_QPC_TRACE 0x2aUL #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_INVALID 0xffffUL #define FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_LAST FUNC_BACKING_STORE_QCFG_V2_RESP_TYPE_INVALID __le16 instance; @@ -4027,6 +4119,7 @@ struct hwrm_func_backing_store_qcaps_v2_input { #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_CA2_TRACE 0x28UL #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_RIGP1_TRACE 0x29UL #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_AFM_KONG_HWRM_TRACE 0x2aUL + #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_ERR_QPC_TRACE 0x2bUL #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_INVALID 0xffffUL #define FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_LAST FUNC_BACKING_STORE_QCAPS_V2_REQ_TYPE_INVALID u8 rsvd[6]; @@ -4070,6 +4163,7 @@ struct hwrm_func_backing_store_qcaps_v2_output { #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_CA2_TRACE 0x28UL #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_RIGP1_TRACE 0x29UL #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_AFM_KONG_HWRM_TRACE 0x2aUL + #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_ERR_QPC_TRACE 0x2bUL #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_INVALID 0xffffUL #define FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_LAST FUNC_BACKING_STORE_QCAPS_V2_RESP_TYPE_INVALID __le16 entry_size; @@ -4216,6 +4310,10 @@ struct hwrm_port_phy_cfg_input { #define PORT_PHY_CFG_REQ_FLAGS_FEC_RS272_1XN_DISABLE 0x100000UL #define PORT_PHY_CFG_REQ_FLAGS_FEC_RS272_IEEE_ENABLE 0x200000UL #define PORT_PHY_CFG_REQ_FLAGS_FEC_RS272_IEEE_DISABLE 0x400000UL + #define PORT_PHY_CFG_REQ_FLAGS_LINK_TRAINING_ENABLE 0x800000UL + #define PORT_PHY_CFG_REQ_FLAGS_LINK_TRAINING_DISABLE 0x1000000UL + #define PORT_PHY_CFG_REQ_FLAGS_PRECODING_ENABLE 0x2000000UL + #define PORT_PHY_CFG_REQ_FLAGS_PRECODING_DISABLE 0x4000000UL __le32 enables; #define PORT_PHY_CFG_REQ_ENABLES_AUTO_MODE 0x1UL #define PORT_PHY_CFG_REQ_ENABLES_AUTO_DUPLEX 0x2UL @@ -4703,6 +4801,8 @@ struct hwrm_port_phy_qcfg_output { #define PORT_PHY_QCFG_RESP_OPTION_FLAGS_MEDIA_AUTO_DETECT 0x1UL #define PORT_PHY_QCFG_RESP_OPTION_FLAGS_SIGNAL_MODE_KNOWN 0x2UL #define PORT_PHY_QCFG_RESP_OPTION_FLAGS_SPEEDS2_SUPPORTED 0x4UL + #define PORT_PHY_QCFG_RESP_OPTION_FLAGS_LINK_TRAINING 0x8UL + #define PORT_PHY_QCFG_RESP_OPTION_FLAGS_PRECODING 0x10UL char phy_vendor_name[16]; char phy_vendor_partnumber[16]; __le16 support_pam4_speeds; @@ -4725,6 +4825,10 @@ struct hwrm_port_phy_qcfg_output { u8 link_down_reason; #define PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_RF 0x1UL #define PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_OTP_SPEED_VIOLATION 0x2UL + #define PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_CABLE_REMOVED 0x4UL + #define PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_MODULE_FAULT 0x8UL + #define PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_BMC_REQUEST 0x10UL + #define PORT_PHY_QCFG_RESP_LINK_DOWN_REASON_TX_LASER_DISABLED 0x20UL __le16 support_speeds2; #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_1GB 0x1UL #define PORT_PHY_QCFG_RESP_SUPPORT_SPEEDS2_10GB 0x2UL @@ -5882,9 +5986,10 @@ struct hwrm_port_led_qcaps_output { #define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_BLINK_SUPPORTED 0x8UL #define PORT_LED_QCAPS_RESP_LED0_STATE_CAPS_BLINK_ALT_SUPPORTED 0x10UL __le16 led0_color_caps; - #define 
PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_RSVD 0x1UL - #define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_AMBER_SUPPORTED 0x2UL - #define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_RSVD 0x1UL + #define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_AMBER_SUPPORTED 0x2UL + #define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED0_COLOR_CAPS_GRNAMB_SUPPORTED 0x8UL u8 led1_id; u8 led1_type; #define PORT_LED_QCAPS_RESP_LED1_TYPE_SPEED 0x0UL @@ -5900,9 +6005,10 @@ struct hwrm_port_led_qcaps_output { #define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_BLINK_SUPPORTED 0x8UL #define PORT_LED_QCAPS_RESP_LED1_STATE_CAPS_BLINK_ALT_SUPPORTED 0x10UL __le16 led1_color_caps; - #define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_RSVD 0x1UL - #define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_AMBER_SUPPORTED 0x2UL - #define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_RSVD 0x1UL + #define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_AMBER_SUPPORTED 0x2UL + #define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED1_COLOR_CAPS_GRNAMB_SUPPORTED 0x8UL u8 led2_id; u8 led2_type; #define PORT_LED_QCAPS_RESP_LED2_TYPE_SPEED 0x0UL @@ -5918,9 +6024,10 @@ struct hwrm_port_led_qcaps_output { #define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_BLINK_SUPPORTED 0x8UL #define PORT_LED_QCAPS_RESP_LED2_STATE_CAPS_BLINK_ALT_SUPPORTED 0x10UL __le16 led2_color_caps; - #define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_RSVD 0x1UL - #define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_AMBER_SUPPORTED 0x2UL - #define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_RSVD 0x1UL + #define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_AMBER_SUPPORTED 0x2UL + #define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED2_COLOR_CAPS_GRNAMB_SUPPORTED 0x8UL u8 led3_id; u8 led3_type; #define PORT_LED_QCAPS_RESP_LED3_TYPE_SPEED 0x0UL @@ -5936,9 +6043,10 @@ struct hwrm_port_led_qcaps_output { #define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_BLINK_SUPPORTED 0x8UL #define PORT_LED_QCAPS_RESP_LED3_STATE_CAPS_BLINK_ALT_SUPPORTED 0x10UL __le16 led3_color_caps; - #define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_RSVD 0x1UL - #define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_AMBER_SUPPORTED 0x2UL - #define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_RSVD 0x1UL + #define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_AMBER_SUPPORTED 0x2UL + #define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_GREEN_SUPPORTED 0x4UL + #define PORT_LED_QCAPS_RESP_LED3_COLOR_CAPS_GRNAMB_SUPPORTED 0x8UL u8 unused_4[3]; u8 valid; }; @@ -7036,9 +7144,22 @@ struct hwrm_vnic_rss_cfg_output { /* hwrm_vnic_rss_cfg_cmd_err (size:64b/8B) */ struct hwrm_vnic_rss_cfg_cmd_err { u8 code; - #define VNIC_RSS_CFG_CMD_ERR_CODE_UNKNOWN 0x0UL - #define VNIC_RSS_CFG_CMD_ERR_CODE_INTERFACE_NOT_READY 0x1UL - #define VNIC_RSS_CFG_CMD_ERR_CODE_LAST VNIC_RSS_CFG_CMD_ERR_CODE_INTERFACE_NOT_READY + #define VNIC_RSS_CFG_CMD_ERR_CODE_UNKNOWN 0x0UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_INTERFACE_NOT_READY 0x1UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_UNABLE_TO_GET_RSS_CFG 0x2UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_HASH_TYPE_UNSUPPORTED 0x3UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_HASH_TYPE_ERR 0x4UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_HASH_MODE_FAIL 0x5UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_RING_GRP_TABLE_ALLOC_ERR 0x6UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_HASH_KEY_ALLOC_ERR 0x7UL 
+ #define VNIC_RSS_CFG_CMD_ERR_CODE_DMA_FAILED 0x8UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_RX_RING_ALLOC_ERR 0x9UL + #define VNIC_RSS_CFG_CMD_ERR_CODE_CMPL_RING_ALLOC_ERR 0xaUL + #define VNIC_RSS_CFG_CMD_ERR_CODE_HW_SET_RSS_FAILED 0xbUL + #define VNIC_RSS_CFG_CMD_ERR_CODE_CTX_INVALID 0xcUL + #define VNIC_RSS_CFG_CMD_ERR_CODE_VNIC_INVALID 0xdUL + #define VNIC_RSS_CFG_CMD_ERR_CODE_VNIC_RING_TABLE_PAIR_INVALID 0xeUL + #define VNIC_RSS_CFG_CMD_ERR_CODE_LAST VNIC_RSS_CFG_CMD_ERR_CODE_VNIC_RING_TABLE_PAIR_INVALID u8 unused_0[7]; }; @@ -7177,7 +7298,7 @@ struct hwrm_vnic_rss_cos_lb_ctx_free_output { u8 valid; }; -/* hwrm_ring_alloc_input (size:704b/88B) */ +/* hwrm_ring_alloc_input (size:768b/96B) */ struct hwrm_ring_alloc_input { __le16 req_type; __le16 cmpl_ring; @@ -7195,6 +7316,7 @@ struct hwrm_ring_alloc_input { #define RING_ALLOC_REQ_ENABLES_MPC_CHNLS_TYPE 0x400UL #define RING_ALLOC_REQ_ENABLES_STEERING_TAG_VALID 0x800UL #define RING_ALLOC_REQ_ENABLES_RX_RATE_PROFILE_VALID 0x1000UL + #define RING_ALLOC_REQ_ENABLES_DPI_VALID 0x2000UL u8 ring_type; #define RING_ALLOC_REQ_RING_TYPE_L2_CMPL 0x0UL #define RING_ALLOC_REQ_RING_TYPE_TX 0x1UL @@ -7287,6 +7409,8 @@ struct hwrm_ring_alloc_input { #define RING_ALLOC_REQ_RX_RATE_PROFILE_SEL_LAST RING_ALLOC_REQ_RX_RATE_PROFILE_SEL_POLL_MODE u8 unused_4; __le64 cq_handle; + __le16 dpi; + __le16 unused_5[3]; }; /* hwrm_ring_alloc_output (size:128b/16B) */ @@ -7776,7 +7900,10 @@ struct hwrm_cfa_l2_set_rx_mask_cmd_err { u8 code; #define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_UNKNOWN 0x0UL #define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_NTUPLE_FILTER_CONFLICT_ERR 0x1UL - #define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_LAST CFA_L2_SET_RX_MASK_CMD_ERR_CODE_NTUPLE_FILTER_CONFLICT_ERR + #define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_MAX_VLAN_TAGS 0x2UL + #define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_INVALID_VNIC_ID 0x3UL + #define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_INVALID_ACTION 0x4UL + #define CFA_L2_SET_RX_MASK_CMD_ERR_CODE_LAST CFA_L2_SET_RX_MASK_CMD_ERR_CODE_INVALID_ACTION u8 unused_0[7]; }; @@ -8109,9 +8236,38 @@ struct hwrm_cfa_ntuple_filter_alloc_output { /* hwrm_cfa_ntuple_filter_alloc_cmd_err (size:64b/8B) */ struct hwrm_cfa_ntuple_filter_alloc_cmd_err { u8 code; - #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_UNKNOWN 0x0UL - #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_RX_MASK_VLAN_CONFLICT_ERR 0x1UL - #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_LAST CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_RX_MASK_VLAN_CONFLICT_ERR + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_UNKNOWN 0x0UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_ZERO_MAC 0x65UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_BC_MC_MAC 0x66UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_VNIC 0x67UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_PF_FID 0x68UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_L2_CTXT_ID 0x69UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_NULL_L2_CTXT_CFG 0x6aUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_NULL_L2_DATA_FLD 0x6bUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_CFA_LAYOUT 0x6cUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_L2_CTXT_ALLOC_FAIL 0x6dUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_ROCE_FLOW_ERR 0x6eUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_OWNER_FID 0x6fUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_ZERO_REF_CNT 0x70UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_FLOW_TYPE 0x71UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_IVLAN 0x72UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_MAX_VLAN_ID 0x73UL + 
#define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_TNL_REQ 0x74UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_L2_ADDR 0x75UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_L2_IVLAN 0x76UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_L3_ADDR 0x77UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_L3_ADDR_TYPE 0x78UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_T_L3_ADDR_TYPE 0x79UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_DST_VNIC_ID 0x7aUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_VNI 0x7bUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_DST_ID 0x7cUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_FAIL_ROCE_L2_FLOW 0x7dUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_NPAR_VLAN 0x7eUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_ATSP_ADD 0x7fUL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_DFLT_VLAN_FAIL 0x80UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_INVALID_L3_TYPE 0x81UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_VAL_FAIL_TNL_FLOW 0x82UL + #define CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_LAST CFA_NTUPLE_FILTER_ALLOC_CMD_ERR_CODE_VAL_FAIL_TNL_FLOW u8 unused_0[7]; }; @@ -9181,7 +9337,7 @@ struct pcie_ctx_hw_stats { __le64 pcie_recovery_histogram; }; -/* pcie_ctx_hw_stats_v2 (size:4096b/512B) */ +/* pcie_ctx_hw_stats_v2 (size:4544b/568B) */ struct pcie_ctx_hw_stats_v2 { __le64 pcie_pl_signal_integrity; __le64 pcie_dl_signal_integrity; @@ -9212,6 +9368,9 @@ struct pcie_ctx_hw_stats_v2 { __le64 pcie_other_packet_count; __le64 pcie_blocked_packet_count; __le64 pcie_cmpl_packet_count; + __le32 pcie_rd_latency_histogram[12]; + __le32 pcie_rd_latency_all_normal_count; + __le32 unused_2; }; /* hwrm_stat_generic_qstats_input (size:256b/32B) */ @@ -9406,7 +9565,8 @@ struct hwrm_struct_hdr { #define STRUCT_HDR_STRUCT_ID_MSIX_PER_VF 0xc8UL #define STRUCT_HDR_STRUCT_ID_UDCC_RTT_BUCKET_COUNT 0x12cUL #define STRUCT_HDR_STRUCT_ID_UDCC_RTT_BUCKET_BOUND 0x12dUL - #define STRUCT_HDR_STRUCT_ID_LAST STRUCT_HDR_STRUCT_ID_UDCC_RTT_BUCKET_BOUND + #define STRUCT_HDR_STRUCT_ID_DBG_TOKEN_CLAIMS 0x190UL + #define STRUCT_HDR_STRUCT_ID_LAST STRUCT_HDR_STRUCT_ID_DBG_TOKEN_CLAIMS __le16 len; u8 version; #define STRUCT_HDR_VERSION_0 0x0UL @@ -9459,11 +9619,13 @@ struct hwrm_fw_set_structured_data_output { /* hwrm_fw_set_structured_data_cmd_err (size:64b/8B) */ struct hwrm_fw_set_structured_data_cmd_err { u8 code; - #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_UNKNOWN 0x0UL - #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_HDR_CNT 0x1UL - #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_FMT 0x2UL - #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID 0x3UL - #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_LAST FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_UNKNOWN 0x0UL + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_HDR_CNT 0x1UL + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_FMT 0x2UL + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_BAD_ID 0x3UL + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_ALREADY_ADDED 0x4UL + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_INST_IN_PROG 0x5UL + #define FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_LAST FW_SET_STRUCTURED_DATA_CMD_ERR_CODE_INST_IN_PROG u8 unused_0[7]; }; @@ -9487,7 +9649,9 @@ struct hwrm_fw_get_structured_data_input { #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NON_TPMR_PEER 0x201UL #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_NON_TPMR_OPERATIONAL 0x202UL #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_HOST_OPERATIONAL 0x300UL - #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_LAST 
FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_HOST_OPERATIONAL + #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_CLAIMS_SUPPORTED 0x320UL + #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_CLAIMS_ACTIVE 0x321UL + #define FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_LAST FW_GET_STRUCTURED_DATA_REQ_SUBTYPE_CLAIMS_ACTIVE u8 count; u8 unused_0; }; @@ -10172,7 +10336,8 @@ struct hwrm_dbg_log_buffer_flush_input { #define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_CA2_TRACE 0x9UL #define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_RIGP1_TRACE 0xaUL #define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_AFM_KONG_HWRM_TRACE 0xbUL - #define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_LAST DBG_LOG_BUFFER_FLUSH_REQ_TYPE_AFM_KONG_HWRM_TRACE + #define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_ERR_QPC_TRACE 0xcUL + #define DBG_LOG_BUFFER_FLUSH_REQ_TYPE_LAST DBG_LOG_BUFFER_FLUSH_REQ_TYPE_ERR_QPC_TRACE u8 unused_1[2]; __le32 flags; #define DBG_LOG_BUFFER_FLUSH_REQ_FLAGS_FLUSH_ALL_BUFFERS 0x1UL @@ -10295,10 +10460,15 @@ struct hwrm_nvm_write_output { /* hwrm_nvm_write_cmd_err (size:64b/8B) */ struct hwrm_nvm_write_cmd_err { u8 code; - #define NVM_WRITE_CMD_ERR_CODE_UNKNOWN 0x0UL - #define NVM_WRITE_CMD_ERR_CODE_FRAG_ERR 0x1UL - #define NVM_WRITE_CMD_ERR_CODE_NO_SPACE 0x2UL - #define NVM_WRITE_CMD_ERR_CODE_LAST NVM_WRITE_CMD_ERR_CODE_NO_SPACE + #define NVM_WRITE_CMD_ERR_CODE_UNKNOWN 0x0UL + #define NVM_WRITE_CMD_ERR_CODE_FRAG_ERR 0x1UL + #define NVM_WRITE_CMD_ERR_CODE_NO_SPACE 0x2UL + #define NVM_WRITE_CMD_ERR_CODE_WRITE_FAILED 0x3UL + #define NVM_WRITE_CMD_ERR_CODE_REQD_ERASE_FAILED 0x4UL + #define NVM_WRITE_CMD_ERR_CODE_VERIFY_FAILED 0x5UL + #define NVM_WRITE_CMD_ERR_CODE_INVALID_HEADER 0x6UL + #define NVM_WRITE_CMD_ERR_CODE_UPDATE_DIGEST_FAILED 0x7UL + #define NVM_WRITE_CMD_ERR_CODE_LAST NVM_WRITE_CMD_ERR_CODE_UPDATE_DIGEST_FAILED u8 unused_0[7]; }; @@ -10438,7 +10608,11 @@ struct hwrm_nvm_get_dev_info_output { __le16 srt2_fw_minor; __le16 srt2_fw_build; __le16 srt2_fw_patch; - u8 unused_0[7]; + u8 security_soc_fw_major; + u8 security_soc_fw_minor; + u8 security_soc_fw_build; + u8 security_soc_fw_patch; + u8 unused_0[3]; u8 valid; }; @@ -10568,7 +10742,9 @@ struct hwrm_nvm_install_update_cmd_err { #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_NO_SPACE 0x2UL #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_ANTI_ROLLBACK 0x3UL #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_NO_VOLTREG_SUPPORT 0x4UL - #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_LAST NVM_INSTALL_UPDATE_CMD_ERR_CODE_NO_VOLTREG_SUPPORT + #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_DEFRAG_FAILED 0x5UL + #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_UNKNOWN_DIR_ERR 0x6UL + #define NVM_INSTALL_UPDATE_CMD_ERR_CODE_LAST NVM_INSTALL_UPDATE_CMD_ERR_CODE_UNKNOWN_DIR_ERR u8 unused_0[7]; }; @@ -10591,7 +10767,8 @@ struct hwrm_nvm_get_variable_input { __le16 index_2; __le16 index_3; u8 flags; - #define NVM_GET_VARIABLE_REQ_FLAGS_FACTORY_DFLT 0x1UL + #define NVM_GET_VARIABLE_REQ_FLAGS_FACTORY_DFLT 0x1UL + #define NVM_GET_VARIABLE_REQ_FLAGS_VALIDATE_OPT_VALUE 0x2UL u8 unused_0; }; @@ -10606,18 +10783,25 @@ struct hwrm_nvm_get_variable_output { #define NVM_GET_VARIABLE_RESP_OPTION_NUM_RSVD_0 0x0UL #define NVM_GET_VARIABLE_RESP_OPTION_NUM_RSVD_FFFF 0xffffUL #define NVM_GET_VARIABLE_RESP_OPTION_NUM_LAST NVM_GET_VARIABLE_RESP_OPTION_NUM_RSVD_FFFF - u8 unused_0[3]; + u8 flags; + #define NVM_GET_VARIABLE_RESP_FLAGS_VALIDATE_OPT_VALUE 0x1UL + u8 unused_0[2]; u8 valid; }; /* hwrm_nvm_get_variable_cmd_err (size:64b/8B) */ struct hwrm_nvm_get_variable_cmd_err { u8 code; - #define NVM_GET_VARIABLE_CMD_ERR_CODE_UNKNOWN 0x0UL - #define NVM_GET_VARIABLE_CMD_ERR_CODE_VAR_NOT_EXIST 0x1UL - #define 
NVM_GET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR 0x2UL
-	#define NVM_GET_VARIABLE_CMD_ERR_CODE_LEN_TOO_SHORT 0x3UL
-	#define NVM_GET_VARIABLE_CMD_ERR_CODE_LAST NVM_GET_VARIABLE_CMD_ERR_CODE_LEN_TOO_SHORT
+	#define NVM_GET_VARIABLE_CMD_ERR_CODE_UNKNOWN 0x0UL
+	#define NVM_GET_VARIABLE_CMD_ERR_CODE_VAR_NOT_EXIST 0x1UL
+	#define NVM_GET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR 0x2UL
+	#define NVM_GET_VARIABLE_CMD_ERR_CODE_LEN_TOO_SHORT 0x3UL
+	#define NVM_GET_VARIABLE_CMD_ERR_CODE_INDEX_INVALID 0x4UL
+	#define NVM_GET_VARIABLE_CMD_ERR_CODE_ACCESS_DENIED 0x5UL
+	#define NVM_GET_VARIABLE_CMD_ERR_CODE_CB_FAILED 0x6UL
+	#define NVM_GET_VARIABLE_CMD_ERR_CODE_INVALID_DATA_LEN 0x7UL
+	#define NVM_GET_VARIABLE_CMD_ERR_CODE_NO_MEM 0x8UL
+	#define NVM_GET_VARIABLE_CMD_ERR_CODE_LAST NVM_GET_VARIABLE_CMD_ERR_CODE_NO_MEM
 	u8 unused_0[7];
 };
@@ -10667,10 +10851,17 @@ struct hwrm_nvm_set_variable_output {
 /* hwrm_nvm_set_variable_cmd_err (size:64b/8B) */
 struct hwrm_nvm_set_variable_cmd_err {
 	u8 code;
-	#define NVM_SET_VARIABLE_CMD_ERR_CODE_UNKNOWN 0x0UL
-	#define NVM_SET_VARIABLE_CMD_ERR_CODE_VAR_NOT_EXIST 0x1UL
-	#define NVM_SET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR 0x2UL
-	#define NVM_SET_VARIABLE_CMD_ERR_CODE_LAST NVM_SET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR
+	#define NVM_SET_VARIABLE_CMD_ERR_CODE_UNKNOWN 0x0UL
+	#define NVM_SET_VARIABLE_CMD_ERR_CODE_VAR_NOT_EXIST 0x1UL
+	#define NVM_SET_VARIABLE_CMD_ERR_CODE_CORRUPT_VAR 0x2UL
+	#define NVM_SET_VARIABLE_CMD_ERR_CODE_LEN_TOO_SHORT 0x3UL
+	#define NVM_SET_VARIABLE_CMD_ERR_CODE_ACTION_NOT_SUPPORTED 0x4UL
+	#define NVM_SET_VARIABLE_CMD_ERR_CODE_INDEX_INVALID 0x5UL
+	#define NVM_SET_VARIABLE_CMD_ERR_CODE_ACCESS_DENIED 0x6UL
+	#define NVM_SET_VARIABLE_CMD_ERR_CODE_CB_FAILED 0x7UL
+	#define NVM_SET_VARIABLE_CMD_ERR_CODE_INVALID_DATA_LEN 0x8UL
+	#define NVM_SET_VARIABLE_CMD_ERR_CODE_NO_MEM 0x9UL
+	#define NVM_SET_VARIABLE_CMD_ERR_CODE_LAST NVM_SET_VARIABLE_CMD_ERR_CODE_NO_MEM
 	u8 unused_0[7];
 };
-- cgit v1.2.3

From 5a774b64cd6a008f016119782a5d3f30ed0bf3b7 Mon Sep 17 00:00:00 2001
From: Horatiu Vultur
Date: Mon, 18 Aug 2025 09:51:21 +0200
Subject: net: phy: micrel: Add support for lan8842

The LAN8842 is a low-power, single-port, triple-speed
(10BASE-T/100BASE-TX/1000BASE-T) Ethernet physical layer transceiver
(PHY) that supports transmission and reception of data on standard
CAT-5, as well as CAT-5e and CAT-6, Unshielded Twisted Pair (UTP)
cables.

The LAN8842 supports the industry-standard SGMII (Serial Gigabit Media
Independent Interface), providing a chip-to-chip connection to a
Gigabit Ethernet MAC using a single serialized link (differential
pair) in each direction.

There are two variants of the lan8842: one that supports timestamping
(lan8842) and one that does not (lan8832).
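
As a quick illustration of how the new PHY ID below is matched (a sketch, not part of the patch): PHY_ID_MATCH_MODEL(), as defined in <linux/phy.h>, pairs the ID with a GENMASK(31, 4) mask, so the low four revision bits of the hardware ID are ignored and every silicon revision of the part binds to the same driver entry. The helper name lan8842_id_matches() here is hypothetical:

	#include <linux/bits.h>
	#include <linux/types.h>

	#define PHY_ID_LAN8842	0x002216C0	/* same value as the hunk below */

	/*
	 * Sketch only: a PHY whose ID differs from PHY_ID_LAN8842 only in
	 * bits 3:0 (the silicon revision) still matches this driver entry.
	 */
	static bool lan8842_id_matches(u32 phy_id)
	{
		return (phy_id & GENMASK(31, 4)) == (PHY_ID_LAN8842 & GENMASK(31, 4));
	}

Model-level matching is what lets the single PHY_ID_MATCH_MODEL(PHY_ID_LAN8842) entry in the phy_driver and mdio_device_id tables cover all revisions of the part.
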
Signed-off-by: Horatiu Vultur Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250818075121.1298170-5-horatiu.vultur@microchip.com Signed-off-by: Paolo Abeni --- drivers/net/phy/micrel.c | 209 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/micrel_phy.h | 1 + 2 files changed, 210 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index c621ed465d2e..04bd744920b0 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -448,6 +448,17 @@ struct kszphy_priv { struct kszphy_phy_stats phy_stats; }; +struct lan8842_phy_stats { + u64 rx_packets; + u64 rx_errors; + u64 tx_packets; + u64 tx_errors; +}; + +struct lan8842_priv { + struct lan8842_phy_stats phy_stats; +}; + static const struct kszphy_type lan8814_type = { .led_mode_reg = ~LAN8814_LED_CTRL_1, .cable_diag_reg = LAN8814_CABLE_DIAG, @@ -5768,6 +5779,188 @@ static int ksz9131_resume(struct phy_device *phydev) return kszphy_resume(phydev); } +#define LAN8842_SELF_TEST 14 /* 0x0e */ +#define LAN8842_SELF_TEST_RX_CNT_ENA BIT(8) +#define LAN8842_SELF_TEST_TX_CNT_ENA BIT(4) + +static int lan8842_probe(struct phy_device *phydev) +{ + struct lan8842_priv *priv; + int ret; + + priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + phydev->priv = priv; + + /* Similar to lan8814 this PHY has a pin which needs to be pulled down + * to enable to pass any traffic through it. Therefore use the same + * function as lan8814 + */ + ret = lan8814_release_coma_mode(phydev); + if (ret) + return ret; + + /* Enable to count the RX and TX packets */ + ret = lanphy_write_page_reg(phydev, LAN8814_PAGE_PCS_DIGITAL, + LAN8842_SELF_TEST, + LAN8842_SELF_TEST_RX_CNT_ENA | + LAN8842_SELF_TEST_TX_CNT_ENA); + if (ret < 0) + return ret; + + return 0; +} + +static int lan8842_config_init(struct phy_device *phydev) +{ + int ret; + + /* Reset the PHY */ + ret = lanphy_modify_page_reg(phydev, LAN8814_PAGE_COMMON_REGS, + LAN8814_QSGMII_SOFT_RESET, + LAN8814_QSGMII_SOFT_RESET_BIT, + LAN8814_QSGMII_SOFT_RESET_BIT); + if (ret < 0) + return ret; + + /* To allow the PHY to control the LEDs the GPIOs of the PHY should have + * a function mode and not the GPIO. Apparently by default the value is + * GPIO and not function even though the datasheet it says that it is + * function. Therefore set this value. + */ + return lanphy_write_page_reg(phydev, LAN8814_PAGE_COMMON_REGS, + LAN8814_GPIO_EN2, 0); +} + +#define LAN8842_INTR_CTRL_REG 52 /* 0x34 */ + +static int lan8842_config_intr(struct phy_device *phydev) +{ + int err; + + lanphy_write_page_reg(phydev, LAN8814_PAGE_COMMON_REGS, + LAN8842_INTR_CTRL_REG, + LAN8814_INTR_CTRL_REG_INTR_ENABLE); + + /* enable / disable interrupts */ + if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { + err = lan8814_ack_interrupt(phydev); + if (err) + return err; + + err = phy_write(phydev, LAN8814_INTC, LAN8814_INT_LINK); + } else { + err = phy_write(phydev, LAN8814_INTC, 0); + if (err) + return err; + + err = lan8814_ack_interrupt(phydev); + } + + return err; +} + +static unsigned int lan8842_inband_caps(struct phy_device *phydev, + phy_interface_t interface) +{ + /* Inband configuration can be enabled or disabled using the registers + * PCS1G_ANEG_CONFIG. 
+ */ + return LINK_INBAND_DISABLE | LINK_INBAND_ENABLE; +} + +static int lan8842_config_inband(struct phy_device *phydev, unsigned int modes) +{ + bool enable; + + if (modes == LINK_INBAND_DISABLE) + enable = false; + else + enable = true; + + /* Disable or enable in-band autoneg with PCS Host side + * It has the same address as lan8814 + */ + return lanphy_modify_page_reg(phydev, LAN8814_PAGE_PORT_REGS, + LAN8814_QSGMII_PCS1G_ANEG_CONFIG, + LAN8814_QSGMII_PCS1G_ANEG_CONFIG_ANEG_ENA, + enable ? LAN8814_QSGMII_PCS1G_ANEG_CONFIG_ANEG_ENA : 0); +} + +static irqreturn_t lan8842_handle_interrupt(struct phy_device *phydev) +{ + int ret = IRQ_NONE; + int irq_status; + + irq_status = phy_read(phydev, LAN8814_INTS); + if (irq_status < 0) { + phy_error(phydev); + return IRQ_NONE; + } + + if (irq_status & LAN8814_INT_LINK) { + phy_trigger_machine(phydev); + ret = IRQ_HANDLED; + } + + return ret; +} + +static u64 lan8842_get_stat(struct phy_device *phydev, int count, int *regs) +{ + u64 ret = 0; + int val; + + for (int j = 0; j < count; ++j) { + val = lanphy_read_page_reg(phydev, LAN8814_PAGE_PCS_DIGITAL, + regs[j]); + if (val < 0) + return U64_MAX; + + ret <<= 16; + ret += val; + } + return ret; +} + +static int lan8842_update_stats(struct phy_device *phydev) +{ + struct lan8842_priv *priv = phydev->priv; + int rx_packets_regs[] = {88, 61, 60}; + int rx_errors_regs[] = {63, 62}; + int tx_packets_regs[] = {89, 85, 84}; + int tx_errors_regs[] = {87, 86}; + + priv->phy_stats.rx_packets = lan8842_get_stat(phydev, + ARRAY_SIZE(rx_packets_regs), + rx_packets_regs); + priv->phy_stats.rx_errors = lan8842_get_stat(phydev, + ARRAY_SIZE(rx_errors_regs), + rx_errors_regs); + priv->phy_stats.tx_packets = lan8842_get_stat(phydev, + ARRAY_SIZE(tx_packets_regs), + tx_packets_regs); + priv->phy_stats.tx_errors = lan8842_get_stat(phydev, + ARRAY_SIZE(tx_errors_regs), + tx_errors_regs); + + return 0; +} + +static void lan8842_get_phy_stats(struct phy_device *phydev, + struct ethtool_eth_phy_stats *eth_stats, + struct ethtool_phy_stats *stats) +{ + struct lan8842_priv *priv = phydev->priv; + + stats->rx_packets = priv->phy_stats.rx_packets; + stats->rx_errors = priv->phy_stats.rx_errors; + stats->tx_packets = priv->phy_stats.tx_packets; + stats->tx_errors = priv->phy_stats.tx_errors; +} + static struct phy_driver ksphy_driver[] = { { PHY_ID_MATCH_MODEL(PHY_ID_KS8737), @@ -5987,6 +6180,21 @@ static struct phy_driver ksphy_driver[] = { .resume = lan8841_resume, .cable_test_start = lan8814_cable_test_start, .cable_test_get_status = ksz886x_cable_test_get_status, +}, { + PHY_ID_MATCH_MODEL(PHY_ID_LAN8842), + .name = "Microchip LAN8842 Gigabit PHY", + .flags = PHY_POLL_CABLE_TEST, + .driver_data = &lan8814_type, + .probe = lan8842_probe, + .config_init = lan8842_config_init, + .config_intr = lan8842_config_intr, + .inband_caps = lan8842_inband_caps, + .config_inband = lan8842_config_inband, + .handle_interrupt = lan8842_handle_interrupt, + .get_phy_stats = lan8842_get_phy_stats, + .update_stats = lan8842_update_stats, + .cable_test_start = lan8814_cable_test_start, + .cable_test_get_status = ksz886x_cable_test_get_status, }, { PHY_ID_MATCH_MODEL(PHY_ID_KSZ9131), .name = "Microchip KSZ9131 Gigabit PHY", @@ -6082,6 +6290,7 @@ static const struct mdio_device_id __maybe_unused micrel_tbl[] = { { PHY_ID_MATCH_MODEL(PHY_ID_LAN8814) }, { PHY_ID_MATCH_MODEL(PHY_ID_LAN8804) }, { PHY_ID_MATCH_MODEL(PHY_ID_LAN8841) }, + { PHY_ID_MATCH_MODEL(PHY_ID_LAN8842) }, { } }; diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h 
index 9af01bdd86d2..ca691641788b 100644 --- a/include/linux/micrel_phy.h +++ b/include/linux/micrel_phy.h @@ -32,6 +32,7 @@ #define PHY_ID_LAN8814 0x00221660 #define PHY_ID_LAN8804 0x00221670 #define PHY_ID_LAN8841 0x00221650 +#define PHY_ID_LAN8842 0x002216C0 #define PHY_ID_KSZ886X 0x00221430 #define PHY_ID_KSZ8863 0x00221435 -- cgit v1.2.3 From 6c9468aad215a198742c8375b0415e42521c905c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:56:54 -0700 Subject: fscrypt: replace raw loads of info pointer with helper function Add and use a helper function fscrypt_get_inode_info_raw(). It loads an inode's fscrypt info pointer using a raw dereference, which is appropriate when the caller knows the key setup already happened. This eliminates most occurrences of inode::i_crypt_info in the source, in preparation for replacing that with a filesystem-specific field. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-2-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/crypto/bio.c | 2 +- fs/crypto/crypto.c | 14 ++++++++------ fs/crypto/fname.c | 11 ++++++----- fs/crypto/hooks.c | 2 +- fs/crypto/inline_crypt.c | 12 +++++++----- fs/crypto/policy.c | 7 ++++--- include/linux/fscrypt.h | 16 ++++++++++++++++ 7 files changed, 43 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 486fcb2ecf13..0d746de4cd10 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -113,7 +113,7 @@ out: int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits; diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index b6ccab524fde..07f9cbfe3ea4 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -173,7 +173,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs, gfp_t gfp_flags) { const struct inode *inode = folio->mapping->host; - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; struct page *ciphertext_page; @@ -232,8 +232,9 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, { if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; - return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT, - lblk_num, page, page, len, offs); + return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode), + FS_ENCRYPT, lblk_num, page, page, len, + offs); } EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); @@ -255,7 +256,7 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs) { const struct inode *inode = folio->mapping->host; - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) + @@ -305,8 +306,9 @@ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, { if 
(WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) return -EOPNOTSUPP; - return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT, - lblk_num, page, page, len, offs); + return fscrypt_crypt_data_unit(fscrypt_get_inode_info_raw(inode), + FS_DECRYPT, lblk_num, page, page, len, + offs); } EXPORT_SYMBOL(fscrypt_decrypt_block_inplace); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index f9f6713e144f..fb77ad1ca74a 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -94,7 +94,7 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; @@ -138,7 +138,7 @@ static int fname_decrypt(const struct inode *inode, const struct fscrypt_str *iname, struct fscrypt_str *oname) { - const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm; SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); union fscrypt_iv iv; @@ -274,8 +274,9 @@ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret) { - return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy, - orig_len, max_len, + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); + + return __fscrypt_fname_encrypted_size(&ci->ci_policy, orig_len, max_len, encrypted_len_ret); } EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size); @@ -543,7 +544,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name); */ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { - const struct fscrypt_inode_info *ci = dir->i_crypt_info; + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(dir); WARN_ON_ONCE(!ci->ci_dirhash_key_initialized); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index e0b32ac841f7..7a5d4c168c49 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -199,7 +199,7 @@ int fscrypt_prepare_setflags(struct inode *inode, err = fscrypt_require_key(inode); if (err) return err; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); if (ci->ci_policy.version != FSCRYPT_POLICY_V2) return -EINVAL; mk = ci->ci_master_key; diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index caaff809765b..5dee7c498bc8 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -263,7 +263,7 @@ int fscrypt_derive_sw_secret(struct super_block *sb, bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) { - return inode->i_crypt_info->ci_inlinecrypt; + return fscrypt_get_inode_info_raw(inode)->ci_inlinecrypt; } EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto); @@ -307,7 +307,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, if (!fscrypt_inode_uses_inline_crypto(inode)) return; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); fscrypt_generate_dun(ci, first_lblk, dun); bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask); @@ -385,22 +385,24 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk) { const struct bio_crypt_ctx *bc = bio->bi_crypt_context; + 
const struct fscrypt_inode_info *ci; u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; if (!!bc != fscrypt_inode_uses_inline_crypto(inode)) return false; if (!bc) return true; + ci = fscrypt_get_inode_info_raw(inode); /* * Comparing the key pointers is good enough, as all I/O for each key * uses the same pointer. I.e., there's currently no need to support * merging requests where the keys are the same but the pointers differ. */ - if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key) + if (bc->bc_key != ci->ci_enc_key.blk_key) return false; - fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun); + fscrypt_generate_dun(ci, next_lblk, next_dun); return bio_crypt_dun_is_contiguous(bc, bio->bi_iter.bi_size, next_dun); } EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio); @@ -502,7 +504,7 @@ u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) if (nr_blocks <= 1) return nr_blocks; - ci = inode->i_crypt_info; + ci = fscrypt_get_inode_info_raw(inode); if (!(fscrypt_policy_flags(&ci->ci_policy) & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) return nr_blocks; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 6ad30ae07c06..9d51f3500de3 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -727,7 +727,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) err = fscrypt_require_key(dir); if (err) return ERR_PTR(err); - return &dir->i_crypt_info->ci_policy; + return &fscrypt_get_inode_info_raw(dir)->ci_policy; } return fscrypt_get_dummy_policy(dir->i_sb); @@ -746,7 +746,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) */ int fscrypt_context_for_new_inode(void *ctx, struct inode *inode) { - struct fscrypt_inode_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); BUILD_BUG_ON(sizeof(union fscrypt_context) != FSCRYPT_SET_CONTEXT_MAX_SIZE); @@ -771,7 +771,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode); */ int fscrypt_set_context(struct inode *inode, void *fs_data) { - struct fscrypt_inode_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci; union fscrypt_context ctx; int ctxsize; @@ -783,6 +783,7 @@ int fscrypt_set_context(struct inode *inode, void *fs_data) * This may be the first time the inode number is available, so do any * delayed key setup that requires the inode number. */ + ci = fscrypt_get_inode_info_raw(inode); if (ci->ci_policy.version == FSCRYPT_POLICY_V2 && (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) fscrypt_hash_inode_number(ci, ci->ci_master_key); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 10dd161690a2..23c5198612d1 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -195,6 +195,22 @@ struct fscrypt_operations { int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags); +/* + * Load the inode's fscrypt info pointer, using a raw dereference. Since this + * uses a raw dereference with no memory barrier, it is appropriate to use only + * when the caller knows the inode's key setup already happened, resulting in + * non-NULL fscrypt info. E.g., the file contents en/decryption functions use + * this, since fscrypt_file_open() set up the key. 
+ */ +static inline struct fscrypt_inode_info * +fscrypt_get_inode_info_raw(const struct inode *inode) +{ + struct fscrypt_inode_info *ci = inode->i_crypt_info; + + VFS_WARN_ON_ONCE(ci == NULL); + return ci; +} + static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { -- cgit v1.2.3 From 93221de31a8df6710e02328f82dc68d7ab4ad9e6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 10 Aug 2025 00:56:55 -0700 Subject: fscrypt: add support for info in fs-specific part of inode Add an inode_info_offs field to struct fscrypt_operations, and update fs/crypto/ to support it. When set to a nonzero value, it specifies the offset to the fscrypt_inode_info pointer within the filesystem-specific part of the inode structure, to be used instead of inode::i_crypt_info. Since this makes inode::i_crypt_info no longer necessarily used, update comments that mentioned it. This is a prerequisite for a later commit that removes inode::i_crypt_info, saving memory and improving cache efficiency with filesystems that don't support fscrypt. Co-developed-by: Christian Brauner Signed-off-by: Eric Biggers Link: https://lore.kernel.org/20250810075706.172910-3-ebiggers@kernel.org Signed-off-by: Christian Brauner --- fs/crypto/fscrypt_private.h | 4 ++-- fs/crypto/keysetup.c | 43 ++++++++++++++++++++++++++----------------- include/linux/fscrypt.h | 22 ++++++++++++++++++---- 3 files changed, 46 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index d8b485b9881c..245e6b84aa17 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -249,8 +249,8 @@ struct fscrypt_prepared_key { * fscrypt_inode_info - the "encryption key" for an inode * * When an encrypted file's key is made available, an instance of this struct is - * allocated and stored in ->i_crypt_info. Once created, it remains until the - * inode is evicted. + * allocated and a pointer to it is stored in the file's in-memory inode. Once + * created, it remains until the inode is evicted. */ struct fscrypt_inode_info { diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 4f3b9ecbfe4e..c1f85715c276 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -642,15 +642,16 @@ fscrypt_setup_encryption_info(struct inode *inode, goto out; /* - * For existing inodes, multiple tasks may race to set ->i_crypt_info. - * So use cmpxchg_release(). This pairs with the smp_load_acquire() in - * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with - * a RELEASE barrier so that other tasks can ACQUIRE it. + * For existing inodes, multiple tasks may race to set the inode's + * fscrypt info pointer. So use cmpxchg_release(). This pairs with the + * smp_load_acquire() in fscrypt_get_inode_info(). I.e., publish the + * pointer with a RELEASE barrier so that other tasks can ACQUIRE it. */ - if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) { + if (cmpxchg_release(fscrypt_inode_info_addr(inode), NULL, crypt_info) == + NULL) { /* - * We won the race and set ->i_crypt_info to our crypt_info. - * Now link it into the master key's inode list. + * We won the race and set the inode's fscrypt info to our + * crypt_info. Now link it into the master key's inode list. */ if (mk) { crypt_info->ci_master_key = mk; @@ -681,13 +682,13 @@ out: * %false unless the operation being performed is needed in * order for files (or directories) to be deleted. * - * Set up ->i_crypt_info, if it hasn't already been done. 
+ * Set up the inode's encryption key, if it hasn't already been done. * - * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So + * Note: unless the key setup was already done, this isn't %GFP_NOFS-safe. So * generally this shouldn't be called from within a filesystem transaction. * - * Return: 0 if ->i_crypt_info was set or was already set, *or* if the - * encryption key is unavailable. (Use fscrypt_has_encryption_key() to + * Return: 0 if the key is now set up, *or* if it couldn't be set up because the + * needed master key is absent. (Use fscrypt_has_encryption_key() to * distinguish these cases.) Also can return another -errno code. */ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) @@ -741,9 +742,9 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) * ->i_ino doesn't need to be set yet. * @encrypt_ret: (output) set to %true if the new inode will be encrypted * - * If the directory is encrypted, set up its ->i_crypt_info in preparation for + * If the directory is encrypted, set up its encryption key in preparation for * encrypting the name of the new file. Also, if the new inode will be - * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true. + * encrypted, set up its encryption key too and set *encrypt_ret=true. * * This isn't %GFP_NOFS-safe, and therefore it should be called before starting * any filesystem transaction to create the inode. For this reason, ->i_ino @@ -752,8 +753,8 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported) * This doesn't persist the new inode's encryption context. That still needs to * be done later by calling fscrypt_set_context(). * - * Return: 0 on success, -ENOKEY if the encryption key is missing, or another - * -errno code + * Return: 0 on success, -ENOKEY if a key needs to be set up for @dir or @inode + * but the needed master key is absent, or another -errno code */ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode, bool *encrypt_ret) @@ -800,8 +801,16 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode); */ void fscrypt_put_encryption_info(struct inode *inode) { - put_crypt_info(inode->i_crypt_info); - inode->i_crypt_info = NULL; + /* + * Ideally we'd start with a lightweight IS_ENCRYPTED() check here + * before proceeding to retrieve and check the pointer. However, during + * inode creation, the fscrypt_inode_info is set before S_ENCRYPTED. If + * an error occurs, it needs to be cleaned up regardless. + */ + struct fscrypt_inode_info **ci_addr = fscrypt_inode_info_addr(inode); + + put_crypt_info(*ci_addr); + *ci_addr = NULL; } EXPORT_SYMBOL(fscrypt_put_encryption_info); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 23c5198612d1..d7ff53accbfe 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -61,6 +61,12 @@ struct fscrypt_name { /* Crypto operations for filesystems */ struct fscrypt_operations { + /* + * The offset of the pointer to struct fscrypt_inode_info in the + * filesystem-specific part of the inode, relative to the beginning of + * the common part of the inode (the 'struct inode'). 
+ */
+	ptrdiff_t inode_info_offs;
 
 	/*
 	 * If set, then fs/crypto/ will allocate a global bounce page pool the
@@ -195,6 +201,14 @@ struct fscrypt_operations {
 int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name,
 			 struct dentry *dentry, unsigned int flags);
 
+static inline struct fscrypt_inode_info **
+fscrypt_inode_info_addr(const struct inode *inode)
+{
+	if (inode->i_sb->s_cop->inode_info_offs == 0)
+		return (struct fscrypt_inode_info **)&inode->i_crypt_info;
+	return (void *)inode + inode->i_sb->s_cop->inode_info_offs;
+}
+
 /*
  * Load the inode's fscrypt info pointer, using a raw dereference. Since this
  * uses a raw dereference with no memory barrier, it is appropriate to use only
@@ -205,7 +219,7 @@ int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name,
 static inline struct fscrypt_inode_info *
 fscrypt_get_inode_info_raw(const struct inode *inode)
 {
-	struct fscrypt_inode_info *ci = inode->i_crypt_info;
+	struct fscrypt_inode_info *ci = *fscrypt_inode_info_addr(inode);
 
 	VFS_WARN_ON_ONCE(ci == NULL);
 	return ci;
@@ -216,11 +230,11 @@ fscrypt_get_inode_info(const struct inode *inode)
 {
 	/*
 	 * Pairs with the cmpxchg_release() in fscrypt_setup_encryption_info().
-	 * I.e., another task may publish ->i_crypt_info concurrently, executing
-	 * a RELEASE barrier. We need to use smp_load_acquire() here to safely
+	 * I.e., another task may publish the fscrypt info concurrently,
+	 * executing a RELEASE barrier. Use smp_load_acquire() here to safely
 	 * ACQUIRE the memory the other task published.
 	 */
-	return smp_load_acquire(&inode->i_crypt_info);
+	return smp_load_acquire(fscrypt_inode_info_addr(inode));
 }
 
 /**
-- cgit v1.2.3

From ab90c2d2476c4dd6deddd089c7e83b858d135783 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sun, 10 Aug 2025 00:57:00 -0700
Subject: fs: remove inode::i_crypt_info

Now that all fscrypt-capable filesystems store the pointer to
fscrypt_inode_info in the filesystem-specific part of the inode
structure, inode::i_crypt_info is no longer needed.

Update fscrypt_inode_info_addr() to no longer support the fallback to
inode::i_crypt_info.

Finally, remove inode::i_crypt_info itself along with the
now-unnecessary forward declaration of fscrypt_inode_info.

The end result of the migration to the filesystem-specific pointer is
memory savings on CONFIG_FS_ENCRYPTION=y kernels for all filesystems
that don't support fscrypt. Specifically, their in-memory inodes are
now smaller by the size of a pointer: either 4 or 8 bytes.
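
As a concrete sketch of the end state this series reaches (the filesystem name "myfs", its field layout, and the ops variable below are hypothetical, not taken from the patches): a filesystem embeds the VFS inode in its private inode structure, keeps the fscrypt info pointer beside it, and reports the pointer's position through inode_info_offs, measured from the embedded 'struct inode'. The offset can be negative whenever the pointer sits before the embedded inode, which is why the field is a ptrdiff_t:

	#include <linux/fs.h>
	#include <linux/fscrypt.h>
	#include <linux/stddef.h>

	/* Hypothetical fs-private inode: the fscrypt info pointer lives here now. */
	struct myfs_inode_info {
		struct fscrypt_inode_info *i_crypt_info;
		/* ... other fs-private fields ... */
		struct inode vfs_inode;	/* the common part of the inode */
	};

	static const struct fscrypt_operations myfs_cryptops = {
		/*
		 * Distance from the embedded 'struct inode' back to the
		 * pointer; fscrypt_inode_info_addr() recovers it as
		 * (void *)inode + inode_info_offs. Negative here, since
		 * i_crypt_info precedes vfs_inode.
		 */
		.inode_info_offs = (int)offsetof(struct myfs_inode_info, i_crypt_info) -
				   (int)offsetof(struct myfs_inode_info, vfs_inode),
		/* ... the filesystem's other fscrypt operations ... */
	};

The fsverity commits that follow apply the same pattern to fsverity_info and fsverity_operations.
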
Co-developed-by: Christian Brauner
Signed-off-by: Eric Biggers
Link: https://lore.kernel.org/20250810075706.172910-8-ebiggers@kernel.org
Signed-off-by: Christian Brauner
---
 include/linux/fs.h      | 5 -----
 include/linux/fscrypt.h | 8 ++++++--
 2 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index d7ab4f96d705..1dafa18169be 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -72,7 +72,6 @@ struct swap_info_struct;
 struct seq_file;
 struct workqueue_struct;
 struct iov_iter;
-struct fscrypt_inode_info;
 struct fscrypt_operations;
 struct fsverity_info;
 struct fsverity_operations;
@@ -780,10 +779,6 @@ struct inode {
 	struct fsnotify_mark_connector __rcu *i_fsnotify_marks;
 #endif
 
-#ifdef CONFIG_FS_ENCRYPTION
-	struct fscrypt_inode_info *i_crypt_info;
-#endif
-
 #ifdef CONFIG_FS_VERITY
 	struct fsverity_info *i_verity_info;
 #endif
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index d7ff53accbfe..516aba5b858b 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -201,11 +201,15 @@ struct fscrypt_operations {
 int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name,
 			 struct dentry *dentry, unsigned int flags);
 
+/*
+ * Returns the address of the fscrypt info pointer within the
+ * filesystem-specific part of the inode.  (To save memory on filesystems that
+ * don't support fscrypt, a field in 'struct inode' itself is no longer used.)
+ */
 static inline struct fscrypt_inode_info **
 fscrypt_inode_info_addr(const struct inode *inode)
 {
-	if (inode->i_sb->s_cop->inode_info_offs == 0)
-		return (struct fscrypt_inode_info **)&inode->i_crypt_info;
+	VFS_WARN_ON_ONCE(inode->i_sb->s_cop->inode_info_offs == 0);
 	return (void *)inode + inode->i_sb->s_cop->inode_info_offs;
 }
--
cgit v1.2.3


From 2a7349add18e5915cd87251af5f98db1772b6131 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sun, 10 Aug 2025 00:57:01 -0700
Subject: fsverity: add support for info in fs-specific part of inode

Add an inode_info_offs field to struct fsverity_operations, and update
fs/verity/ to support it.  When set to a nonzero value, it specifies
the offset to the fsverity_info pointer within the filesystem-specific
part of the inode structure, to be used instead of
inode::i_verity_info.

Since inode::i_verity_info is no longer necessarily used, update
comments that mentioned it.

This is a prerequisite for a later commit that removes
inode::i_verity_info, saving memory and improving cache efficiency on
filesystems that don't support fsverity.

Co-developed-by: Christian Brauner
Signed-off-by: Eric Biggers
Link: https://lore.kernel.org/20250810075706.172910-9-ebiggers@kernel.org
Signed-off-by: Christian Brauner
---
 fs/verity/enable.c           |  6 +++---
 fs/verity/fsverity_private.h |  9 +++++----
 fs/verity/open.c             | 23 ++++++++++++-----------
 fs/verity/verify.c           |  2 +-
 include/linux/fsverity.h     | 44 ++++++++++++++++++++++++++++++++++----------
 5 files changed, 55 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index 503268cf4296..89eccc4becf9 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -284,9 +284,9 @@ static int enable_verity(struct file *filp,
 		/* Successfully enabled verity */
 
 		/*
-		 * Readers can start using ->i_verity_info immediately, so it
-		 * can't be rolled back once set.  So don't set it until just
-		 * after the filesystem has successfully enabled verity.
+		 * Readers can start using the inode's verity info immediately,
+		 * so it can't be rolled back once set.  So don't set it until
+		 * just after the filesystem has successfully enabled verity.
 		 */
 		fsverity_set_info(inode, vi);
 	}
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index 5fe854a5b9ad..bc1d887c532e 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -63,10 +63,11 @@ struct merkle_tree_params {
 * fsverity_info - cached verity metadata for an inode
 *
 * When a verity file is first opened, an instance of this struct is allocated
- * and stored in ->i_verity_info; it remains until the inode is evicted.  It
- * caches information about the Merkle tree that's needed to efficiently verify
- * data read from the file.  It also caches the file digest.  The Merkle tree
- * pages themselves are not cached here, but the filesystem may cache them.
+ * and a pointer to it is stored in the file's in-memory inode.  It remains
+ * until the inode is evicted.  It caches information about the Merkle tree
+ * that's needed to efficiently verify data read from the file.  It also caches
+ * the file digest.  The Merkle tree pages themselves are not cached here, but
+ * the filesystem may cache them.
 */
 struct fsverity_info {
 	struct merkle_tree_params tree_params;
diff --git a/fs/verity/open.c b/fs/verity/open.c
index c561e130cd0c..77b1c977af02 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -244,17 +244,17 @@ fail:
 void fsverity_set_info(struct inode *inode, struct fsverity_info *vi)
 {
 	/*
-	 * Multiple tasks may race to set ->i_verity_info, so use
-	 * cmpxchg_release().  This pairs with the smp_load_acquire() in
-	 * fsverity_get_info().  I.e., here we publish ->i_verity_info with a
-	 * RELEASE barrier so that other tasks can ACQUIRE it.
+	 * Multiple tasks may race to set the inode's verity info pointer, so
+	 * use cmpxchg_release().  This pairs with the smp_load_acquire() in
+	 * fsverity_get_info().  I.e., publish the pointer with a RELEASE
+	 * barrier so that other tasks can ACQUIRE it.
 	 */
-	if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) {
-		/* Lost the race, so free the fsverity_info we allocated. */
+	if (cmpxchg_release(fsverity_info_addr(inode), NULL, vi) != NULL) {
+		/* Lost the race, so free the verity info we allocated. */
 		fsverity_free_info(vi);
 		/*
-		 * Afterwards, the caller may access ->i_verity_info directly,
-		 * so make sure to ACQUIRE the winning fsverity_info.
+		 * Afterwards, the caller may access the inode's verity info
+		 * directly, so make sure to ACQUIRE the winning verity info.
 		 */
 		(void)fsverity_get_info(inode);
 	}
@@ -350,7 +350,6 @@ int fsverity_get_descriptor(struct inode *inode,
 	return 0;
 }
 
-/* Ensure the inode has an ->i_verity_info */
 static int ensure_verity_info(struct inode *inode)
 {
 	struct fsverity_info *vi = fsverity_get_info(inode);
@@ -395,8 +394,10 @@ EXPORT_SYMBOL_GPL(__fsverity_prepare_setattr);
 
 void __fsverity_cleanup_inode(struct inode *inode)
 {
-	fsverity_free_info(inode->i_verity_info);
-	inode->i_verity_info = NULL;
+	struct fsverity_info **vi_addr = fsverity_info_addr(inode);
+
+	fsverity_free_info(*vi_addr);
+	*vi_addr = NULL;
 }
 EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode);
 
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index a1f00c3fd3b2..affc307eb6a6 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -245,7 +245,7 @@ verify_data_blocks(struct folio *data_folio, size_t len, size_t offset,
 		   unsigned long max_ra_pages)
 {
 	struct inode *inode = data_folio->mapping->host;
-	struct fsverity_info *vi = inode->i_verity_info;
+	struct fsverity_info *vi = *fsverity_info_addr(inode);
 	const unsigned int block_size = vi->tree_params.block_size;
 	u64 pos = (u64)data_folio->index << PAGE_SHIFT;
 
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index 1eb7eae580be..e0f132cb7839 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -28,6 +28,12 @@
 
 /* Verity operations for filesystems */
 struct fsverity_operations {
+	/**
+	 * The offset of the pointer to struct fsverity_info in the
+	 * filesystem-specific part of the inode, relative to the beginning of
+	 * the common part of the inode (the 'struct inode').
+	 */
+	ptrdiff_t inode_info_offs;
 
 	/**
	 * Begin enabling verity on the given file.
@@ -124,15 +130,33 @@ struct fsverity_operations {
 
 #ifdef CONFIG_FS_VERITY
 
+static inline struct fsverity_info **
+fsverity_info_addr(const struct inode *inode)
+{
+	if (inode->i_sb->s_vop->inode_info_offs == 0)
+		return (struct fsverity_info **)&inode->i_verity_info;
+	return (void *)inode + inode->i_sb->s_vop->inode_info_offs;
+}
+
 static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
 {
 	/*
-	 * Pairs with the cmpxchg_release() in fsverity_set_info().
-	 * I.e., another task may publish ->i_verity_info concurrently,
-	 * executing a RELEASE barrier.  We need to use smp_load_acquire() here
-	 * to safely ACQUIRE the memory the other task published.
+	 * Since this function can be called on inodes belonging to filesystems
+	 * that don't support fsverity at all, and fsverity_info_addr() doesn't
+	 * work on such filesystems, we have to start with an IS_VERITY() check.
+	 * Checking IS_VERITY() here is also useful to minimize the overhead of
+	 * fsverity_active() on non-verity files.
+	 */
+	if (!IS_VERITY(inode))
+		return NULL;
+
+	/*
+	 * Pairs with the cmpxchg_release() in fsverity_set_info().  I.e.,
+	 * another task may publish the inode's verity info concurrently,
+	 * executing a RELEASE barrier.  Use smp_load_acquire() here to safely
+	 * ACQUIRE the memory the other task published.
 	 */
-	return smp_load_acquire(&inode->i_verity_info);
+	return smp_load_acquire(fsverity_info_addr(inode));
 }
 
 /* enable.c */
@@ -156,11 +180,11 @@ void __fsverity_cleanup_inode(struct inode *inode);
 * fsverity_cleanup_inode() - free the inode's verity info, if present
 * @inode: an inode being evicted
 *
- * Filesystems must call this on inode eviction to free ->i_verity_info.
+ * Filesystems must call this on inode eviction to free the inode's verity info.
 */
 static inline void fsverity_cleanup_inode(struct inode *inode)
 {
-	if (inode->i_verity_info)
+	if (*fsverity_info_addr(inode))
 		__fsverity_cleanup_inode(inode);
 }
 
@@ -267,12 +291,12 @@ static inline bool fsverity_verify_page(struct page *page)
 * fsverity_active() - do reads from the inode need to go through fs-verity?
 * @inode: inode to check
 *
- * This checks whether ->i_verity_info has been set.
+ * This checks whether the inode's verity info has been set.
 *
 * Filesystems call this from ->readahead() to check whether the pages need to
 * be verified or not.  Don't use IS_VERITY() for this purpose; it's subject to
 * a race condition where the file is being read concurrently with
- * FS_IOC_ENABLE_VERITY completing.  (S_VERITY is set before ->i_verity_info.)
+ * FS_IOC_ENABLE_VERITY completing.  (S_VERITY is set before the verity info.)
 *
 * Return: true if reads need to go through fs-verity, otherwise false
 */
@@ -287,7 +311,7 @@ static inline bool fsverity_active(const struct inode *inode)
 * @filp: the struct file being set up
 *
 * When opening a verity file, deny the open if it is for writing.  Otherwise,
- * set up the inode's ->i_verity_info if not already done.
+ * set up the inode's verity info if not already done.
 *
 * When combined with fscrypt, this must be called after fscrypt_file_open().
 * Otherwise, we won't have the key set up to decrypt the verity metadata.
--
cgit v1.2.3


From 818c659ac164e4e4639ceaedaccbdfebb1ef63b5 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sun, 10 Aug 2025 00:57:05 -0700
Subject: fs: remove inode::i_verity_info

Now that all fsverity-capable filesystems store the pointer to
fsverity_info in the filesystem-specific part of the inode structure,
inode::i_verity_info is no longer needed.

Update fsverity_info_addr() to no longer support the fallback to
inode::i_verity_info.

Finally, remove inode::i_verity_info itself, and move the forward
declaration of struct fsverity_info from fs.h (which no longer needs
it) to fsverity.h.

The end result of the migration to the filesystem-specific pointer is
memory savings on CONFIG_FS_VERITY=y kernels for all filesystems that
don't support fsverity.  Specifically, their in-memory inodes are now
smaller by the size of a pointer: either 4 or 8 bytes.

Co-developed-by: Christian Brauner
Signed-off-by: Eric Biggers
Link: https://lore.kernel.org/20250810075706.172910-13-ebiggers@kernel.org
Signed-off-by: Christian Brauner
---
 include/linux/fs.h       |  5 -----
 include/linux/fsverity.h | 10 ++++++++--
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1dafa18169be..12ecc6b0e6f9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -73,7 +73,6 @@ struct seq_file;
 struct workqueue_struct;
 struct iov_iter;
 struct fscrypt_operations;
-struct fsverity_info;
 struct fsverity_operations;
 struct fsnotify_mark_connector;
 struct fsnotify_sb_info;
@@ -779,10 +778,6 @@ struct inode {
 	struct fsnotify_mark_connector __rcu *i_fsnotify_marks;
 #endif
 
-#ifdef CONFIG_FS_VERITY
-	struct fsverity_info *i_verity_info;
-#endif
-
 	void *i_private; /* fs or device private pointer */
 } __randomize_layout;
 
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index e0f132cb7839..844f7b8b56bb 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -26,6 +26,8 @@
 /* Arbitrary limit to bound the kmalloc() size.  Can be changed. */
 #define FS_VERITY_MAX_DESCRIPTOR_SIZE	16384
 
+struct fsverity_info;
+
 /* Verity operations for filesystems */
 struct fsverity_operations {
 	/**
@@ -130,11 +132,15 @@ struct fsverity_operations {
 
 #ifdef CONFIG_FS_VERITY
 
+/*
+ * Returns the address of the verity info pointer within the
+ * filesystem-specific part of the inode.  (To save memory on filesystems that
+ * don't support fsverity, a field in 'struct inode' itself is no longer used.)
+ */
 static inline struct fsverity_info **
 fsverity_info_addr(const struct inode *inode)
 {
-	if (inode->i_sb->s_vop->inode_info_offs == 0)
-		return (struct fsverity_info **)&inode->i_verity_info;
+	VFS_WARN_ON_ONCE(inode->i_sb->s_vop->inode_info_offs == 0);
 	return (void *)inode + inode->i_sb->s_vop->inode_info_offs;
 }
--
cgit v1.2.3


From 8a3d00dde63a339d31d1fdeead24ddfd4d459c70 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sun, 10 Aug 2025 00:57:06 -0700
Subject: fsverity: check IS_VERITY() in fsverity_cleanup_inode()

Since getting the address of the fsverity_info has gotten a bit more
expensive, make fsverity_cleanup_inode() check for IS_VERITY() instead.
This avoids adding more overhead to non-verity files.

This assumes that the verity info is never set when !IS_VERITY(), which
is currently true, but add a VFS_WARN_ON_ONCE() that asserts that.
(This of course defeats the optimization, but only when
CONFIG_VFS_DEBUG=y.)

Signed-off-by: Eric Biggers
Link: https://lore.kernel.org/20250810075706.172910-14-ebiggers@kernel.org
Signed-off-by: Christian Brauner
---
 include/linux/fsverity.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index 844f7b8b56bb..5bc7280425a7 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -190,8 +190,15 @@ void __fsverity_cleanup_inode(struct inode *inode);
 */
 static inline void fsverity_cleanup_inode(struct inode *inode)
 {
-	if (*fsverity_info_addr(inode))
+	/*
+	 * Only IS_VERITY() inodes can have verity info, so start by checking
+	 * for IS_VERITY() (which is faster than retrieving the pointer to the
+	 * verity info).  This minimizes overhead for non-verity inodes.
+	 */
+	if (IS_VERITY(inode))
 		__fsverity_cleanup_inode(inode);
+	else
+		VFS_WARN_ON_ONCE(*fsverity_info_addr(inode) != NULL);
 }
 
 /* read_metadata.c */
--
cgit v1.2.3


From 0f07b7919d679050d354d3279faa74bdc7ce17a0 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Sun, 20 Jul 2025 13:21:12 +0200
Subject: uprobes: Rename arch_uretprobe_trampoline function

We are about to add an uprobe trampoline, so clean up the namespace.

Signed-off-by: Jiri Olsa
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Andrii Nakryiko
Acked-by: Oleg Nesterov
Link: https://lore.kernel.org/r/20250720112133.244369-3-jolsa@kernel.org
---
 arch/x86/kernel/uprobes.c | 2 +-
 include/linux/uprobes.h   | 2 +-
 kernel/events/uprobes.c   | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6d383839e839..77050e5a4680 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -338,7 +338,7 @@ extern u8 uretprobe_trampoline_entry[];
 extern u8 uretprobe_trampoline_end[];
 extern u8 uretprobe_syscall_check[];
 
-void *arch_uprobe_trampoline(unsigned long *psize)
+void *arch_uretprobe_trampoline(unsigned long *psize)
 {
 	static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
 	struct pt_regs *regs = task_pt_regs(current);
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 516217c39094..01112f27cd21 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -224,7 +224,7 @@ extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs);
 extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
 				  void *src, unsigned long len);
 extern void uprobe_handle_trampoline(struct pt_regs *regs);
-extern void *arch_uprobe_trampoline(unsigned long *psize);
+extern void *arch_uretprobe_trampoline(unsigned long *psize);
 extern unsigned long uprobe_get_trampoline_vaddr(void);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 1cbfe3cfe573..dd4dd156c956 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1726,7 +1726,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
 	return ret;
 }
 
-void * __weak arch_uprobe_trampoline(unsigned long *psize)
+void * __weak arch_uretprobe_trampoline(unsigned long *psize)
 {
 	static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
 
@@ -1758,7 +1758,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
 	init_waitqueue_head(&area->wq);
 	/* Reserve the 1st slot for get_trampoline_vaddr() */
 	set_bit(0, area->bitmap);
-	insns = arch_uprobe_trampoline(&insns_size);
+	insns = arch_uretprobe_trampoline(&insns_size);
 	arch_uprobe_copy_ixol(area->page, 0, insns, insns_size);
 
 	if (!xol_add_vma(mm, area))
--
cgit v1.2.3


From 82afdd05a16a424409682e06a53d6afcda038d30 Mon Sep 17 00:00:00 2001
From: Kriish Sharma
From: Jiri Olsa
Date: Sun, 20 Jul 2025 13:21:13 +0200
Subject: uprobes: Make copy_from_page global

Making copy_from_page global and adding the uprobe prefix. Adding the
uprobe prefix to copy_to_page as well for symmetry.
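As a usage sketch (assuming the caller already holds a reference on the
page, as the call sites in this patch do), reading the currently
installed opcode at vaddr now reads:

	uprobe_opcode_t opcode;

	uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);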
Signed-off-by: Jiri Olsa
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Andrii Nakryiko
Acked-by: Oleg Nesterov
Link: https://lore.kernel.org/r/20250720112133.244369-4-jolsa@kernel.org
---
 include/linux/uprobes.h |  1 +
 kernel/events/uprobes.c | 10 +++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 01112f27cd21..7447e15559b8 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -226,6 +226,7 @@ extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
 extern void uprobe_handle_trampoline(struct pt_regs *regs);
 extern void *arch_uretprobe_trampoline(unsigned long *psize);
 extern unsigned long uprobe_get_trampoline_vaddr(void);
+extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
 };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index dd4dd156c956..f993a3422083 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -177,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn)
 	return is_swbp_insn(insn);
 }
 
-static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
+void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
 {
 	void *kaddr = kmap_atomic(page);
 	memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
@@ -205,7 +205,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
	 * is a trap variant; uprobes always wins over any other (gdb)
	 * breakpoint.
 	 */
-	copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
+	uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
 	is_swbp = is_swbp_insn(&old_opcode);
 
 	if (is_swbp_insn(new_opcode)) {
@@ -1051,7 +1051,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
 	if (IS_ERR(page))
 		return PTR_ERR(page);
 
-	copy_from_page(page, offset, insn, nbytes);
+	uprobe_copy_from_page(page, offset, insn, nbytes);
 	put_page(page);
 
 	return 0;
@@ -1397,7 +1397,7 @@ struct uprobe *uprobe_register(struct inode *inode,
 		return ERR_PTR(-EINVAL);
 
 	/*
-	 * This ensures that copy_from_page(), copy_to_page() and
+	 * This ensures that uprobe_copy_from_page(), copy_to_page() and
	 * __update_ref_ctr() can't cross page boundary.
 	 */
 	if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
@@ -2393,7 +2393,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
 	if (result < 0)
 		return result;
 
-	copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+	uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
 	put_page(page);
 out:
 	/* This needs to return true for any variant of the trap insn */
--
cgit v1.2.3


From 33d7b2beaf34a3c0f6406bc76f6e1b1755150ad9 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Sun, 20 Jul 2025 13:21:14 +0200
Subject: uprobes: Add uprobe_write function

Adding the uprobe_write function that does what uprobe_write_opcode did
so far, but allows passing a verify callback function that checks the
memory location before writing the opcode.

It will be used in following changes to implement specific checking
logic for instruction updates.

uprobe_write_opcode now calls uprobe_write with verify_opcode as the
verify callback.
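As a sketch of the resulting API (the callback below is hypothetical,
not part of this patch): a verify callback returns 1 to proceed with
the write, 0 when there is nothing to do, or a negative errno, and
uprobe_write skips the write for any return <= 0:

	static int verify_always(struct page *page, unsigned long vaddr,
				 uprobe_opcode_t *opcode)
	{
		return 1;	/* always proceed with the write */
	}

	err = uprobe_write(auprobe, vma, vaddr, opcode, verify_always);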
Signed-off-by: Jiri Olsa
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Andrii Nakryiko
Acked-by: Masami Hiramatsu (Google)
Acked-by: Oleg Nesterov
Link: https://lore.kernel.org/r/20250720112133.244369-5-jolsa@kernel.org
---
 include/linux/uprobes.h |  5 +++++
 kernel/events/uprobes.c | 14 ++++++++++----
 2 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 7447e15559b8..e13382054435 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -187,6 +187,9 @@ struct uprobes_state {
 	struct xol_area *xol_area;
 };
 
+typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr,
+				     uprobe_opcode_t *opcode);
+
 extern void __init uprobes_init(void);
 extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr);
 extern int set_orig_insn(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr);
@@ -195,6 +198,8 @@ extern bool is_trap_insn(uprobe_opcode_t *insn);
 extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
 extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
 extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t);
+extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr,
+			uprobe_opcode_t opcode, uprobe_write_verify_t verify);
 extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
 extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f993a3422083..838ac40e91e6 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -399,7 +399,7 @@ static bool orig_page_is_identical(struct vm_area_struct *vma,
 	return identical;
 }
 
-static int __uprobe_write_opcode(struct vm_area_struct *vma,
+static int __uprobe_write(struct vm_area_struct *vma,
 		struct folio_walk *fw, struct folio *folio,
 		unsigned long opcode_vaddr, uprobe_opcode_t opcode)
 {
@@ -487,6 +487,12 @@ remap:
 */
 int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 		const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
+{
+	return uprobe_write(auprobe, vma, opcode_vaddr, opcode, verify_opcode);
+}
+
+int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+		const unsigned long opcode_vaddr, uprobe_opcode_t opcode, uprobe_write_verify_t verify)
 {
 	const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
 	struct mm_struct *mm = vma->vm_mm;
@@ -509,7 +515,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
	 * page that we can safely modify.  Use FOLL_WRITE to trigger a write
	 * fault if required.  When unregistering, we might be lucky and the
	 * anon page is already gone.  So defer write faults until really
-	 * required.  Use FOLL_SPLIT_PMD, because __uprobe_write_opcode()
+	 * required.  Use FOLL_SPLIT_PMD, because __uprobe_write()
	 * cannot deal with PMDs yet.
 	 */
 	if (is_register)
@@ -521,7 +527,7 @@ retry:
 		goto out;
 	folio = page_folio(page);
 
-	ret = verify_opcode(page, opcode_vaddr, &opcode);
+	ret = verify(page, opcode_vaddr, &opcode);
 	if (ret <= 0) {
 		folio_put(folio);
 		goto out;
@@ -560,7 +566,7 @@ retry:
 	/* Walk the page tables again, to perform the actual update. */
 	if (folio_walk_start(&fw, vma, vaddr, 0)) {
 		if (fw.page == page)
-			ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode);
+			ret = __uprobe_write(vma, &fw, folio, opcode_vaddr, opcode);
 		folio_walk_end(&fw, vma);
 	}
--
cgit v1.2.3


From f8b7c528b4fb7018d12b6bb63bb52576cfc73697 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Sun, 20 Jul 2025 13:21:15 +0200
Subject: uprobes: Add nbytes argument to uprobe_write

Adding an nbytes argument to uprobe_write and related functions as
preparation for writing whole instructions in following changes.

Also renaming the opcode arguments to insn, which seems to fit better.

Signed-off-by: Jiri Olsa
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Masami Hiramatsu (Google)
Acked-by: Andrii Nakryiko
Acked-by: Oleg Nesterov
Link: https://lore.kernel.org/r/20250720112133.244369-6-jolsa@kernel.org
---
 include/linux/uprobes.h |  4 ++--
 kernel/events/uprobes.c | 26 ++++++++++++++------------
 2 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index e13382054435..147c4a0a1af9 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -188,7 +188,7 @@ struct uprobes_state {
 };
 
 typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr,
-				     uprobe_opcode_t *opcode);
+				     uprobe_opcode_t *insn, int nbytes);
 
 extern void __init uprobes_init(void);
 extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr);
@@ -199,7 +199,7 @@ extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
 extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
 extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t);
 extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr,
-			uprobe_opcode_t opcode, uprobe_write_verify_t verify);
+			uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify);
 extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
 extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 838ac40e91e6..c133fd4b492d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -191,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
 	kunmap_atomic(kaddr);
 }
 
-static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
+static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn,
+			 int nbytes)
 {
 	uprobe_opcode_t old_opcode;
 	bool is_swbp;
@@ -208,7 +209,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
 	uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
 	is_swbp = is_swbp_insn(&old_opcode);
 
-	if (is_swbp_insn(new_opcode)) {
+	if (is_swbp_insn(insn)) {
 		if (is_swbp)		/* register: already installed? */
 			return 0;
 	} else {
@@ -401,10 +402,10 @@ static bool orig_page_is_identical(struct vm_area_struct *vma,
 
 static int __uprobe_write(struct vm_area_struct *vma,
 		struct folio_walk *fw, struct folio *folio,
-		unsigned long opcode_vaddr, uprobe_opcode_t opcode)
+		unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes)
 {
-	const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
-	const bool is_register = !!is_swbp_insn(&opcode);
+	const unsigned long vaddr = insn_vaddr & PAGE_MASK;
+	const bool is_register = !!is_swbp_insn(insn);
 	bool pmd_mappable;
 
 	/* For now, we'll only handle PTE-mapped folios. */
@@ -429,7 +430,7 @@ static int __uprobe_write(struct vm_area_struct *vma,
 	 */
 	flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
 	fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
-	copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
+	copy_to_page(fw->page, insn_vaddr, insn, nbytes);
 
 	/*
 	 * When unregistering, we may only zap a PTE if uffd is disabled and
@@ -488,13 +489,14 @@ remap:
 int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 		const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
 {
-	return uprobe_write(auprobe, vma, opcode_vaddr, opcode, verify_opcode);
+	return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, verify_opcode);
 }
 
 int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
-		const unsigned long opcode_vaddr, uprobe_opcode_t opcode, uprobe_write_verify_t verify)
+		const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
+		uprobe_write_verify_t verify)
 {
-	const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
+	const unsigned long vaddr = insn_vaddr & PAGE_MASK;
 	struct mm_struct *mm = vma->vm_mm;
 	struct uprobe *uprobe;
 	int ret, is_register, ref_ctr_updated = 0;
@@ -504,7 +506,7 @@ int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 	struct folio *folio;
 	struct page *page;
 
-	is_register = is_swbp_insn(&opcode);
+	is_register = is_swbp_insn(insn);
 	uprobe = container_of(auprobe, struct uprobe, arch);
 
 	if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
@@ -527,7 +529,7 @@ retry:
 		goto out;
 	folio = page_folio(page);
 
-	ret = verify(page, opcode_vaddr, &opcode);
+	ret = verify(page, insn_vaddr, insn, nbytes);
 	if (ret <= 0) {
 		folio_put(folio);
 		goto out;
@@ -566,7 +568,7 @@ retry:
 	/* Walk the page tables again, to perform the actual update. */
 	if (folio_walk_start(&fw, vma, vaddr, 0)) {
 		if (fw.page == page)
-			ret = __uprobe_write(vma, &fw, folio, opcode_vaddr, opcode);
+			ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes);
 		folio_walk_end(&fw, vma);
 	}
--
cgit v1.2.3


From ec46350fe1e2338f42ee84974c36b25afe8ba53a Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Sun, 20 Jul 2025 13:21:16 +0200
Subject: uprobes: Add is_register argument to uprobe_write and uprobe_write_opcode

uprobe_write has a special path to restore the original page when we
write the original instruction back.  This happens when uprobe_write
detects that we want to write anything other than the breakpoint
instruction.

Moving the detection away and passing it to uprobe_write as an
argument, so it's possible to write different instructions (other than
just the breakpoint and the restored original instruction).

Signed-off-by: Jiri Olsa
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Masami Hiramatsu (Google)
Acked-by: Andrii Nakryiko
Acked-by: Oleg Nesterov
Link: https://lore.kernel.org/r/20250720112133.244369-7-jolsa@kernel.org
---
 arch/arm/probes/uprobes/core.c |  2 +-
 include/linux/uprobes.h        |  5 +++--
 kernel/events/uprobes.c        | 21 +++++++++++----------
 3 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/probes/uprobes/core.c b/arch/arm/probes/uprobes/core.c
index 885e0c5e8c20..3d96fb41d624 100644
--- a/arch/arm/probes/uprobes/core.c
+++ b/arch/arm/probes/uprobes/core.c
@@ -30,7 +30,7 @@ int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 	     unsigned long vaddr)
 {
 	return uprobe_write_opcode(auprobe, vma, vaddr,
-		   __opcode_to_mem_arm(auprobe->bpinsn));
+		   __opcode_to_mem_arm(auprobe->bpinsn), true);
 }
 
 bool arch_uprobe_ignore(struct arch_uprobe *auprobe, struct pt_regs *regs)
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 147c4a0a1af9..518b26756469 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -197,9 +197,10 @@ extern bool is_swbp_insn(uprobe_opcode_t *insn);
 extern bool is_trap_insn(uprobe_opcode_t *insn);
 extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
 extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
-extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t);
+extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t,
+			       bool is_register);
 extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr,
-			uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify);
+			uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register);
 extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
 extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c133fd4b492d..955e5ed3e383 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -402,10 +402,10 @@ static bool orig_page_is_identical(struct vm_area_struct *vma,
 
 static int __uprobe_write(struct vm_area_struct *vma,
 		struct folio_walk *fw, struct folio *folio,
-		unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes)
+		unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
+		bool is_register)
 {
 	const unsigned long vaddr = insn_vaddr & PAGE_MASK;
-	const bool is_register = !!is_swbp_insn(insn);
 	bool pmd_mappable;
 
 	/* For now, we'll only handle PTE-mapped folios. */
@@ -487,26 +487,27 @@ remap:
 * Return 0 (success) or a negative errno.
 */
 int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
-		const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
+		const unsigned long opcode_vaddr, uprobe_opcode_t opcode,
+		bool is_register)
 {
-	return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, verify_opcode);
+	return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE,
+			    verify_opcode, is_register);
 }
 
 int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 		const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
-		uprobe_write_verify_t verify)
+		uprobe_write_verify_t verify, bool is_register)
 {
 	const unsigned long vaddr = insn_vaddr & PAGE_MASK;
 	struct mm_struct *mm = vma->vm_mm;
 	struct uprobe *uprobe;
-	int ret, is_register, ref_ctr_updated = 0;
+	int ret, ref_ctr_updated = 0;
 	unsigned int gup_flags = FOLL_FORCE;
 	struct mmu_notifier_range range;
 	struct folio_walk fw;
 	struct folio *folio;
 	struct page *page;
 
-	is_register = is_swbp_insn(insn);
 	uprobe = container_of(auprobe, struct uprobe, arch);
 
 	if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
@@ -568,7 +569,7 @@ retry:
 	/* Walk the page tables again, to perform the actual update. */
 	if (folio_walk_start(&fw, vma, vaddr, 0)) {
 		if (fw.page == page)
-			ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes);
+			ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register);
 		folio_walk_end(&fw, vma);
 	}
 
@@ -610,7 +611,7 @@ out:
 int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 		unsigned long vaddr)
 {
-	return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN);
+	return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true);
 }
 
 /**
@@ -626,7 +627,7 @@ int __weak set_orig_insn(struct arch_uprobe *auprobe,
 		struct vm_area_struct *vma, unsigned long vaddr)
 {
 	return uprobe_write_opcode(auprobe, vma, vaddr,
-			*(uprobe_opcode_t *)&auprobe->insn);
+			*(uprobe_opcode_t *)&auprobe->insn, false);
 }
 
 /* uprobe should have guaranteed positive refcount */
--
cgit v1.2.3


From 18a111256a0b4fedfe47101f084441a84d7e357a Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Sun, 20 Jul 2025 13:21:17 +0200
Subject: uprobes: Add do_ref_ctr argument to uprobe_write function

Making the update_ref_ctr call in uprobe_write conditional based on the
new do_update_ref_ctr argument.  This way we can use uprobe_write for
instruction updates without doing the ref_ctr_offset update.
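For example (an illustrative call, anticipating the instruction-update
users added later in this series), a caller that rewrites instruction
bytes without touching the reference counter would pass:

	err = uprobe_write(auprobe, vma, vaddr, insn, nbytes, verify,
			   true /* is_register */, false /* do_update_ref_ctr */);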
Signed-off-by: Jiri Olsa
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Masami Hiramatsu (Google)
Acked-by: Andrii Nakryiko
Acked-by: Oleg Nesterov
Link: https://lore.kernel.org/r/20250720112133.244369-8-jolsa@kernel.org
---
 include/linux/uprobes.h | 2 +-
 kernel/events/uprobes.c | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 518b26756469..5080619560d4 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -200,7 +200,7 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
 extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t,
 			       bool is_register);
 extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr,
-			uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register);
+			uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr);
 extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
 extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 955e5ed3e383..da2b3d0deab6 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -491,12 +491,12 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 		bool is_register)
 {
 	return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE,
-			    verify_opcode, is_register);
+			    verify_opcode, is_register, true /* do_update_ref_ctr */);
 }
 
 int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 		const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
-		uprobe_write_verify_t verify, bool is_register)
+		uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr)
 {
 	const unsigned long vaddr = insn_vaddr & PAGE_MASK;
 	struct mm_struct *mm = vma->vm_mm;
@@ -537,7 +537,7 @@ retry:
 	}
 
 	/* We are going to replace instruction, update ref_ctr. */
-	if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
+	if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) {
 		ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
 		if (ret) {
 			folio_put(folio);
@@ -589,7 +589,7 @@ retry:
 
 out:
 	/* Revert back reference counter if instruction update failed. */
-	if (ret < 0 && ref_ctr_updated)
+	if (do_update_ref_ctr && ret < 0 && ref_ctr_updated)
 		update_ref_ctr(uprobe, mm, is_register ? -1 : 1);
 
 	/* try collapse pmd for compound page */
--
cgit v1.2.3


From 91440ff4cafad4c86322a612e523f7f021a493e7 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Sun, 20 Jul 2025 13:21:18 +0200
Subject: uprobes/x86: Add mapping for optimized uprobe trampolines

Adding support to add a special mapping for the user space trampoline
with the following functions:

  uprobe_trampoline_get - find or add uprobe_trampoline
  uprobe_trampoline_put - remove or destroy uprobe_trampoline

The user space trampoline is exported as an arch-specific user space
special mapping through tramp_mapping, which is initialized in the
following changes with the new uprobe syscall.
The uprobe trampoline needs to be callable/reachable from the probed
address, so while searching for an available address we use the
is_reachable_by_call function to decide if the uprobe trampoline is
callable from the probe address.

All uprobe_trampoline objects are stored in the uprobes_state object
and are cleaned up when the process mm_struct goes down.  Adding new
arch hooks for that, because this change is x86_64 specific.

Locking is provided by callers in following changes.

Signed-off-by: Jiri Olsa
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Andrii Nakryiko
Acked-by: Oleg Nesterov
Acked-by: Masami Hiramatsu (Google)
Link: https://lore.kernel.org/r/20250720112133.244369-9-jolsa@kernel.org
---
 arch/x86/kernel/uprobes.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/uprobes.h   |   6 ++
 kernel/events/uprobes.c   |  10 ++++
 kernel/fork.c             |   1 +
 4 files changed, 161 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 77050e5a4680..6c4dcbdd0c3c 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -608,6 +608,150 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
 		*sr = utask->autask.saved_scratch_register;
 	}
 }
+
+static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
+{
+	return -EPERM;
+}
+
+static struct page *tramp_mapping_pages[2] __ro_after_init;
+
+static struct vm_special_mapping tramp_mapping = {
+	.name   = "[uprobes-trampoline]",
+	.mremap = tramp_mremap,
+	.pages  = tramp_mapping_pages,
+};
+
+struct uprobe_trampoline {
+	struct hlist_node node;
+	unsigned long vaddr;
+};
+
+static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr)
+{
+	long delta = (long)(vaddr + 5 - vtramp);
+
+	return delta >= INT_MIN && delta <= INT_MAX;
+}
+
+static unsigned long find_nearest_trampoline(unsigned long vaddr)
+{
+	struct vm_unmapped_area_info info = {
+		.length     = PAGE_SIZE,
+		.align_mask = ~PAGE_MASK,
+	};
+	unsigned long low_limit, high_limit;
+	unsigned long low_tramp, high_tramp;
+	unsigned long call_end = vaddr + 5;
+
+	if (check_add_overflow(call_end, INT_MIN, &low_limit))
+		low_limit = PAGE_SIZE;
+
+	high_limit = call_end + INT_MAX;
+
+	/* Search up from the caller address. */
+	info.low_limit = call_end;
+	info.high_limit = min(high_limit, TASK_SIZE);
+	high_tramp = vm_unmapped_area(&info);
+
+	/* Search down from the caller address. */
+	info.low_limit = max(low_limit, PAGE_SIZE);
+	info.high_limit = call_end;
+	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+	low_tramp = vm_unmapped_area(&info);
+
+	if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp))
+		return -ENOMEM;
+	if (IS_ERR_VALUE(high_tramp))
+		return low_tramp;
+	if (IS_ERR_VALUE(low_tramp))
+		return high_tramp;
+
+	/* Return the address that's closest to the caller address. */
+	if (call_end - low_tramp < high_tramp - call_end)
+		return low_tramp;
+	return high_tramp;
+}
+
+static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	struct mm_struct *mm = current->mm;
+	struct uprobe_trampoline *tramp;
+	struct vm_area_struct *vma;
+
+	if (!user_64bit_mode(regs))
+		return NULL;
+
+	vaddr = find_nearest_trampoline(vaddr);
+	if (IS_ERR_VALUE(vaddr))
+		return NULL;
+
+	tramp = kzalloc(sizeof(*tramp), GFP_KERNEL);
+	if (unlikely(!tramp))
+		return NULL;
+
+	tramp->vaddr = vaddr;
+	vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE,
+				VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
+				&tramp_mapping);
+	if (IS_ERR(vma)) {
+		kfree(tramp);
+		return NULL;
+	}
+	return tramp;
+}
+
+__maybe_unused
+static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new)
+{
+	struct uprobes_state *state = &current->mm->uprobes_state;
+	struct uprobe_trampoline *tramp = NULL;
+
+	if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE)
+		return NULL;
+
+	hlist_for_each_entry(tramp, &state->head_tramps, node) {
+		if (is_reachable_by_call(tramp->vaddr, vaddr)) {
+			*new = false;
+			return tramp;
+		}
+	}
+
+	tramp = create_uprobe_trampoline(vaddr);
+	if (!tramp)
+		return NULL;
+
+	*new = true;
+	hlist_add_head(&tramp->node, &state->head_tramps);
+	return tramp;
+}
+
+static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp)
+{
+	/*
+	 * We do not unmap and release the uprobe trampoline page itself,
+	 * because there's no easy way to make sure none of the threads
+	 * is still inside the trampoline.
+	 */
+	hlist_del(&tramp->node);
+	kfree(tramp);
+}
+
+void arch_uprobe_init_state(struct mm_struct *mm)
+{
+	INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps);
+}
+
+void arch_uprobe_clear_state(struct mm_struct *mm)
+{
+	struct uprobes_state *state = &mm->uprobes_state;
+	struct uprobe_trampoline *tramp;
+	struct hlist_node *n;
+
+	hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
+		destroy_uprobe_trampoline(tramp);
+}
 #else /* 32-bit: */
 /*
 * No RIP-relative addressing on 32-bit
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 5080619560d4..b40d33aae016 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include
 
 struct uprobe;
 struct vm_area_struct;
@@ -185,6 +186,9 @@ struct xol_area;
 
 struct uprobes_state {
 	struct xol_area *xol_area;
+#ifdef CONFIG_X86_64
+	struct hlist_head head_tramps;
+#endif
 };
 
 typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr,
@@ -233,6 +237,8 @@ extern void uprobe_handle_trampoline(struct pt_regs *regs);
 extern void *arch_uretprobe_trampoline(unsigned long *psize);
 extern unsigned long uprobe_get_trampoline_vaddr(void);
 extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len);
+extern void arch_uprobe_clear_state(struct mm_struct *mm);
+extern void arch_uprobe_init_state(struct mm_struct *mm);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
 };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index da2b3d0deab6..2cd7a4c6f303 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1801,6 +1801,14 @@ static struct xol_area *get_xol_area(void)
 	return area;
 }
 
+void __weak arch_uprobe_clear_state(struct mm_struct *mm)
+{
+}
+
+void __weak arch_uprobe_init_state(struct mm_struct *mm)
+{
+}
+
 /*
 * uprobe_clear_state - Free the area allocated for slots.
 */
@@ -1812,6 +1820,8 @@ void uprobe_clear_state(struct mm_struct *mm)
 	delayed_uprobe_remove(NULL, mm);
 	mutex_unlock(&delayed_uprobe_lock);
 
+	arch_uprobe_clear_state(mm);
+
 	if (!area)
 		return;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index af673856499d..d827cc6c5362 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1015,6 +1015,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
 {
 #ifdef CONFIG_UPROBES
 	mm->uprobes_state.xol_area = NULL;
+	arch_uprobe_init_state(mm);
 #endif
 }
--
cgit v1.2.3


From 56101b69c9190667f473b9f93f8b6d8209aaa816 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Sun, 20 Jul 2025 13:21:19 +0200
Subject: uprobes/x86: Add uprobe syscall to speed up uprobe

Adding a new uprobe syscall that calls uprobe handlers for a given
'breakpoint' address.

The idea is that the 'breakpoint' address calls the user space
trampoline, which executes the uprobe syscall.  The syscall handler
reads the return address of the initial call to retrieve the original
'breakpoint' address.  With this address we find the related uprobe
object and call its consumers.

Adding the arch_uprobe_trampoline_mapping function that provides the
uprobe trampoline mapping.  This mapping is backed with one global page
initialized at __init time and shared by all the mapping instances.

We do not allow the uprobe syscall to execute if the caller is not from
the uprobe trampoline mapping.

The uprobe syscall ensures the consumer (bpf program) sees register
values in the state before the trampoline was called.

Signed-off-by: Jiri Olsa
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Andrii Nakryiko
Acked-by: Oleg Nesterov
Acked-by: Masami Hiramatsu (Google)
Link: https://lore.kernel.org/r/20250720112133.244369-10-jolsa@kernel.org
---
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 arch/x86/kernel/uprobes.c              | 139 +++++++++++++++++++++++++++++++++
 include/linux/syscalls.h               |   2 +
 include/linux/uprobes.h                |   1 +
 kernel/events/uprobes.c                |  17 ++++
 kernel/sys_ni.c                        |   1 +
 6 files changed, 161 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 92cf0fe2291e..ced2a1deecd7 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -345,6 +345,7 @@
 333	common	io_pgetevents		sys_io_pgetevents
 334	common	rseq			sys_rseq
 335	common	uretprobe		sys_uretprobe
+336	common	uprobe			sys_uprobe
 # don't use numbers 387 through 423, add new calls after the last
 # 'common' entry
 424	common	pidfd_send_signal	sys_pidfd_send_signal
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6c4dcbdd0c3c..d18e1ae59901 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -752,6 +752,145 @@ void arch_uprobe_clear_state(struct mm_struct *mm)
 	hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
 		destroy_uprobe_trampoline(tramp);
 }
+
+static bool __in_uprobe_trampoline(unsigned long ip)
+{
+	struct vm_area_struct *vma = vma_lookup(current->mm, ip);
+
+	return vma && vma_is_special_mapping(vma, &tramp_mapping);
+}
+
+static bool in_uprobe_trampoline(unsigned long ip)
+{
+	struct mm_struct *mm = current->mm;
+	bool found, retry = true;
+	unsigned int seq;
+
+	rcu_read_lock();
+	if (mmap_lock_speculate_try_begin(mm, &seq)) {
+		found = __in_uprobe_trampoline(ip);
+		retry = mmap_lock_speculate_retry(mm, seq);
+	}
+	rcu_read_unlock();
+
+	if (retry) {
+		mmap_read_lock(mm);
+		found = __in_uprobe_trampoline(ip);
+		mmap_read_unlock(mm);
+	}
+	return found;
+}
+
+/*
+ * See the uprobe syscall trampoline; the call to the trampoline will push
+ * the return address on the stack, and the trampoline itself then pushes
+ * cx, r11 and ax.
+ */
+struct uprobe_syscall_args {
+	unsigned long ax;
+	unsigned long r11;
+	unsigned long cx;
+	unsigned long retaddr;
+};
+
+SYSCALL_DEFINE0(uprobe)
+{
+	struct pt_regs *regs = task_pt_regs(current);
+	struct uprobe_syscall_args args;
+	unsigned long ip, sp;
+	int err;
+
+	/* Allow execution only from uprobe trampolines. */
+	if (!in_uprobe_trampoline(regs->ip))
+		goto sigill;
+
+	err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
+	if (err)
+		goto sigill;
+
+	ip = regs->ip;
+
+	/*
+	 * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus:
+	 * - adjust ip to the probe address, call saved next instruction address
+	 * - adjust sp to the probe's stack frame (check trampoline code)
+	 */
+	regs->ax  = args.ax;
+	regs->r11 = args.r11;
+	regs->cx  = args.cx;
+	regs->ip  = args.retaddr - 5;
+	regs->sp += sizeof(args);
+	regs->orig_ax = -1;
+
+	sp = regs->sp;
+
+	handle_syscall_uprobe(regs, regs->ip);
+
+	/*
+	 * Some of the uprobe consumers have changed sp, we can do nothing,
+	 * just return via iret.
+	 */
+	if (regs->sp != sp) {
+		/* skip the trampoline call */
+		if (args.retaddr - 5 == regs->ip)
+			regs->ip += 5;
+		return regs->ax;
+	}
+
+	regs->sp -= sizeof(args);
+
+	/* for the case uprobe_consumer has changed ax/r11/cx */
+	args.ax  = regs->ax;
+	args.r11 = regs->r11;
+	args.cx  = regs->cx;
+
+	/* keep return address unless we are instructed otherwise */
+	if (args.retaddr - 5 != regs->ip)
+		args.retaddr = regs->ip;
+
+	regs->ip = ip;
+
+	err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
+	if (err)
+		goto sigill;
+
+	/* ensure sysret, see do_syscall_64() */
+	regs->r11 = regs->flags;
+	regs->cx  = regs->ip;
+	return 0;
+
+sigill:
+	force_sig(SIGILL);
+	return -1;
+}
+
+asm (
+	".pushsection .rodata\n"
+	".balign " __stringify(PAGE_SIZE) "\n"
+	"uprobe_trampoline_entry:\n"
+	"push %rcx\n"
+	"push %r11\n"
+	"push %rax\n"
+	"movq $" __stringify(__NR_uprobe) ", %rax\n"
+	"syscall\n"
+	"pop %rax\n"
+	"pop %r11\n"
+	"pop %rcx\n"
+	"ret\n"
+	".balign " __stringify(PAGE_SIZE) "\n"
+	".popsection\n"
+);
+
+extern u8 uprobe_trampoline_entry[];
+
+static int __init arch_uprobes_init(void)
+{
+	tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry);
+	return 0;
+}
+
+late_initcall(arch_uprobes_init);
+
 #else /* 32-bit: */
 /*
 * No RIP-relative addressing on 32-bit
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 77f45e5d4413..66c06fcdfe19 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1005,6 +1005,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
 
 asmlinkage long sys_uretprobe(void);
 
+asmlinkage long sys_uprobe(void);
+
 /* pciconfig: alpha, arm, arm64, ia64, sparc */
 asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
 				   unsigned long off, unsigned long len,
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index b40d33aae016..b6b077cc7d0f 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -239,6 +239,7 @@ extern unsigned long uprobe_get_trampoline_vaddr(void);
 extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len);
 extern void arch_uprobe_clear_state(struct mm_struct *mm);
 extern void arch_uprobe_init_state(struct mm_struct *mm);
+extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
 };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2cd7a4c6f303..eb07e602b6c9 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2771,6 +2771,23 @@ out:
 	rcu_read_unlock_trace();
 }
 
+void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
+{
+	struct uprobe *uprobe;
+	int is_swbp;
+
+	guard(rcu_tasks_trace)();
+
+	uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
+	if (!uprobe)
+		return;
+	if (!get_utask())
+		return;
+	if (arch_uprobe_ignore(&uprobe->arch, regs))
+		return;
+	handler_chain(uprobe, regs);
+}
+
 /*
 * Perform required fix-ups and disable singlestep.
 * Allow pending signals to take effect.
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c00a86931f8c..bf5d05c635ff 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -392,3 +392,4 @@ COND_SYSCALL(setuid16);
 COND_SYSCALL(rseq);
 
 COND_SYSCALL(uretprobe);
+COND_SYSCALL(uprobe);
--
cgit v1.2.3


From ba2bfc97b4629b10bd8d02b36e04f3932a04cac4 Mon Sep 17 00:00:00 2001
From: Jiri Olsa
Date: Sun, 20 Jul 2025 13:21:20 +0200
Subject: uprobes/x86: Add support to optimize uprobes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Putting together all the previously added pieces to support optimized
uprobes on top of the 5-byte nop instruction.

The current uprobe execution goes through the following:

  - installs breakpoint instruction over original instruction
  - exception handler hit and calls related uprobe consumers
  - and either simulates original instruction or does out of line single step
    execution of it
  - returns to user space

The optimized uprobe path does the following:

  - checks the original instruction is a 5-byte nop (plus other checks)
  - adds (or uses existing) user space trampoline with uprobe syscall
  - overwrites original instruction (5-byte nop) with call to user space
    trampoline
  - the user space trampoline executes the uprobe syscall that calls related
    uprobe consumers
  - trampoline returns back to next instruction

This approach won't speed up all uprobes as it's limited to using nop5
as the original instruction, but we plan to use nop5 as the USDT probe
instruction (which currently uses a single byte nop) and speed up the
USDT probes.

The arch_uprobe_optimize function triggers the uprobe optimization and
is called after the first uprobe hit.  I originally had it called on
uprobe installation, but then it clashed with the elf loader, because
the user space trampoline was added in a place where the loader might
need to put elf segments, so I decided to do it after the first uprobe
hit when loading is done.

The uprobe is un-optimized in the arch specific set_orig_insn call.

The instruction overwrite is x86 arch specific and needs to go through
3 updates: (on top of the nop5 instruction)

  - write int3 into the 1st byte
  - write the last 4 bytes of the call instruction
  - update the call instruction opcode

And the cleanup goes through similar reverse stages:

  - overwrite the call opcode with breakpoint (int3)
  - write the last 4 bytes of the nop5 instruction
  - write the nop5 first instruction byte

We do not unmap and release the uprobe trampoline when it's no longer
needed, because there's no easy way to make sure none of the threads
is still inside the trampoline.  But we do not waste memory, because
there's just a single page for all the uprobe trampoline mappings.

We do waste a frame on the page mapping for every 4GB by keeping the
uprobe trampoline page mapped, but that seems ok.
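To make the three-stage rewrite above concrete, the patched bytes go
through the following states (illustrative; R1..R4 stand for the rel32
of the call, i.e. tramp - (vaddr + 5)):

	0f 1f 44 00 00     original nop5
	cc 1f 44 00 00     1) int3 written into the first byte
	cc R1 R2 R3 R4     2) last 4 bytes of the call written
	e8 R1 R2 R3 R4     3) call opcode written over the int3

with all CPUs synchronized between the steps; un-optimizing walks the
same states in reverse.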
We benefit from the fact that set_swbp and set_orig_insn are called
under mmap_write_lock(mm), so we can use the current instruction as the
state the uprobe is in - nop5/breakpoint/call trampoline - and decide
the needed action (optimize/un-optimize) based on that.

Attaching the speedup from the benchs/run_bench_uprobes.sh script:

current:
        usermode-count :  152.604 ± 0.044M/s
        syscall-count  :   13.359 ± 0.042M/s
 -->    uprobe-nop     :    3.229 ± 0.002M/s
        uprobe-push    :    3.086 ± 0.004M/s
        uprobe-ret     :    1.114 ± 0.004M/s
        uprobe-nop5    :    1.121 ± 0.005M/s
        uretprobe-nop  :    2.145 ± 0.002M/s
        uretprobe-push :    2.070 ± 0.001M/s
        uretprobe-ret  :    0.931 ± 0.001M/s
        uretprobe-nop5 :    0.957 ± 0.001M/s

after the change:
        usermode-count :  152.448 ± 0.244M/s
        syscall-count  :   14.321 ± 0.059M/s
        uprobe-nop     :    3.148 ± 0.007M/s
        uprobe-push    :    2.976 ± 0.004M/s
        uprobe-ret     :    1.068 ± 0.003M/s
 -->    uprobe-nop5    :    7.038 ± 0.007M/s
        uretprobe-nop  :    2.109 ± 0.004M/s
        uretprobe-push :    2.035 ± 0.001M/s
        uretprobe-ret  :    0.908 ± 0.001M/s
        uretprobe-nop5 :    3.377 ± 0.009M/s

I see a bit more speedup on Intel (above) compared to AMD.  The big
nop5 speedup is partly due to emulating nop5 and partly due to the
optimization.

The key speedup we do this for is the USDT switch from nop to nop5:

        uprobe-nop     :    3.148 ± 0.007M/s
        uprobe-nop5    :    7.038 ± 0.007M/s

Signed-off-by: Jiri Olsa
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Andrii Nakryiko
Acked-by: Oleg Nesterov
Acked-by: Masami Hiramatsu (Google)
Link: https://lore.kernel.org/r/20250720112133.244369-11-jolsa@kernel.org
---
 arch/x86/include/asm/uprobes.h |   7 +
 arch/x86/kernel/uprobes.c      | 283 ++++++++++++++++++++++++++++++++++++++++-
 include/linux/uprobes.h        |   6 +-
 kernel/events/uprobes.c        |  16 ++-
 4 files changed, 305 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h
index 678fb546f0a7..1ee2e5115955 100644
--- a/arch/x86/include/asm/uprobes.h
+++ b/arch/x86/include/asm/uprobes.h
@@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t;
 #define UPROBE_SWBP_INSN		0xcc
 #define UPROBE_SWBP_INSN_SIZE		   1
 
+enum {
+	ARCH_UPROBE_FLAG_CAN_OPTIMIZE	= 0,
+	ARCH_UPROBE_FLAG_OPTIMIZE_FAIL	= 1,
+};
+
 struct uprobe_xol_ops;
 
 struct arch_uprobe {
@@ -45,6 +50,8 @@ struct arch_uprobe {
 			u8	ilen;
 		}			push;
 	};
+
+	unsigned long flags;
 };
 
 struct arch_uprobe_task {
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index d18e1ae59901..209ce74ab93f 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 
 /* Post-execution fixups. */
 
@@ -702,7 +703,6 @@ static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
 	return tramp;
 }
 
-__maybe_unused
 static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new)
 {
 	struct uprobes_state *state = &current->mm->uprobes_state;
@@ -891,6 +891,280 @@ static int __init arch_uprobes_init(void)
 
 late_initcall(arch_uprobes_init);
 
+enum {
+	EXPECT_SWBP,
+	EXPECT_CALL,
+};
+
+struct write_opcode_ctx {
+	unsigned long base;
+	int expect;
+};
+
+static int is_call_insn(uprobe_opcode_t *insn)
+{
+	return *insn == CALL_INSN_OPCODE;
+}
+
+/*
+ * Verification callback used by int3_update uprobe_write calls to make sure
+ * the underlying instruction is as expected - either int3 or call.
+ */
+static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
+		       int nbytes, void *data)
+{
+	struct write_opcode_ctx *ctx = data;
+	uprobe_opcode_t old_opcode[5];
+
+	uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);
+
+	switch (ctx->expect) {
+	case EXPECT_SWBP:
+		if (is_swbp_insn(&old_opcode[0]))
+			return 1;
+		break;
+	case EXPECT_CALL:
+		if (is_call_insn(&old_opcode[0]))
+			return 1;
+		break;
+	}
+
+	return -1;
+}
+
+/*
+ * Modify multi-byte instructions by using INT3 breakpoints on SMP.
+ * We completely avoid using stop_machine() here, and achieve the
+ * synchronization using INT3 breakpoints and SMP cross-calls.
+ * (borrowed comment from smp_text_poke_batch_finish)
+ *
+ * The way it is done:
+ *	- Add an INT3 trap to the address that will be patched
+ *	- SMP sync all CPUs
+ *	- Update all but the first byte of the patched range
+ *	- SMP sync all CPUs
+ *	- Replace the first byte (INT3) by the first byte of the replacing opcode
+ *	- SMP sync all CPUs
+ */
+static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+		       unsigned long vaddr, char *insn, bool optimize)
+{
+	uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
+	struct write_opcode_ctx ctx = {
+		.base = vaddr,
+	};
+	int err;
+
+	/*
+	 * Write int3 trap.
+	 *
+	 * The swbp_optimize path comes with the breakpoint already installed,
+	 * so we can skip this step for optimize == true.
+	 */
+	if (!optimize) {
+		ctx.expect = EXPECT_CALL;
+		err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn,
+				   true /* is_register */, false /* do_update_ref_ctr */,
+				   &ctx);
+		if (err)
+			return err;
+	}
+
+	smp_text_poke_sync_each_cpu();
+
+	/* Write all but the first byte of the patched range. */
+	ctx.expect = EXPECT_SWBP;
+	err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn,
+			   true /* is_register */, false /* do_update_ref_ctr */,
+			   &ctx);
+	if (err)
+		return err;
+
+	smp_text_poke_sync_each_cpu();
+
+	/*
+	 * Write first byte.
+	 *
+	 * The swbp_unoptimize path needs to finish the uprobe removal together
+	 * with the ref_ctr update, using uprobe_write with proper flags.
+	 */
+	err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn,
+			   optimize /* is_register */, !optimize /* do_update_ref_ctr */,
+			   &ctx);
+	if (err)
+		return err;
+
+	smp_text_poke_sync_each_cpu();
+	return 0;
+}
+
+static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+			 unsigned long vaddr, unsigned long tramp)
+{
+	u8 call[5];
+
+	__text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr,
+			(const void *) tramp, CALL_INSN_SIZE);
+	return int3_update(auprobe, vma, vaddr, call, true /* optimize */);
+}
+
+static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+			   unsigned long vaddr)
+{
+	return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */);
+}
+
+static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len)
+{
+	unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD;
+	struct vm_area_struct *vma;
+	struct page *page;
+
+	page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+	uprobe_copy_from_page(page, vaddr, dst, len);
+	put_page(page);
+	return 0;
+}
+
+static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
+{
+	struct __packed __arch_relative_insn {
+		u8 op;
+		s32 raddr;
+	} *call = (struct __arch_relative_insn *) insn;
+
+	if (!is_call_insn(insn))
+		return false;
+	return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
+}
+
+static int is_optimized(struct mm_struct *mm, unsigned long vaddr, bool *optimized)
+{
+	uprobe_opcode_t insn[5];
+	int err;
+
+	err = copy_from_vaddr(mm, vaddr, &insn, 5);
+	if (err)
+		return err;
+	*optimized = __is_optimized((uprobe_opcode_t *)&insn, vaddr);
+	return 0;
+}
+
+static bool should_optimize(struct arch_uprobe *auprobe)
+{
+	return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) &&
+		test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+}
+
+int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+	     unsigned long vaddr)
+{
+	if (should_optimize(auprobe)) {
+		bool optimized = false;
+		int err;
+
+		/*
+		 * We could race with another thread that already optimized the probe,
+		 * so let's not overwrite it with int3 again in this case.
+		 */
+		err = is_optimized(vma->vm_mm, vaddr, &optimized);
+		if (err)
+			return err;
+		if (optimized)
+			return 0;
+	}
+	return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN,
+				   true /* is_register */);
+}
+
+int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
+		  unsigned long vaddr)
+{
+	if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
+		struct mm_struct *mm = vma->vm_mm;
+		bool optimized = false;
+		int err;
+
+		err = is_optimized(mm, vaddr, &optimized);
+		if (err)
+			return err;
+		if (optimized) {
+			err = swbp_unoptimize(auprobe, vma, vaddr);
+			WARN_ON_ONCE(err);
+			return err;
+		}
+	}
+	return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn,
+				   false /* is_register */);
+}
+
+static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm,
+				  unsigned long vaddr)
+{
+	struct uprobe_trampoline *tramp;
+	struct vm_area_struct *vma;
+	bool new = false;
+	int err = 0;
+
+	vma = find_vma(mm, vaddr);
+	if (!vma)
+		return -EINVAL;
+	tramp = get_uprobe_trampoline(vaddr, &new);
+	if (!tramp)
+		return -EINVAL;
+	err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr);
+	if (WARN_ON_ONCE(err) && new)
+		destroy_uprobe_trampoline(tramp);
+	return err;
+}
+
+void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+	struct mm_struct *mm = current->mm;
+	uprobe_opcode_t insn[5];
+
+	/*
+	 * Do not optimize if shadow stack is enabled, the return address hijack
+	 * code in arch_uretprobe_hijack_return_addr updates wrong frame when
+	 * the entry uprobe is optimized and the shadow stack crashes the app.
+	 */
+	if (shstk_is_enabled())
+		return;
+
+	if (!should_optimize(auprobe))
+		return;
+
+	mmap_write_lock(mm);
+
+	/*
+	 * Check if some other thread already optimized the uprobe for us,
+	 * if it's the case just go away silently.
+	 */
+	if (copy_from_vaddr(mm, vaddr, &insn, 5))
+		goto unlock;
+	if (!is_swbp_insn((uprobe_opcode_t *) &insn))
+		goto unlock;
+
+	/*
+	 * If we fail to optimize the uprobe we set the fail bit so the
+	 * above should_optimize will fail from now on.
+	 */
+	if (__arch_uprobe_optimize(auprobe, mm, vaddr))
+		set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
+
+unlock:
+	mmap_write_unlock(mm);
+}
+
+static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
+{
+	if (memcmp(&auprobe->insn, x86_nops[5], 5))
+		return false;
+	/* We can't do cross page atomic writes yet.
*/ + return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5; +} #else /* 32-bit: */ /* * No RIP-relative addressing on 32-bit @@ -904,6 +1178,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { } +static bool can_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ + return false; +} #endif /* CONFIG_X86_64 */ struct uprobe_xol_ops { @@ -1270,6 +1548,9 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, if (ret) return ret; + if (can_optimize(auprobe, addr)) + set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags); + ret = branch_setup_xol_ops(auprobe, &insn); if (ret != -ENOSYS) return ret; diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b6b077cc7d0f..08ef78439d0d 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -192,7 +192,7 @@ struct uprobes_state { }; typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr, - uprobe_opcode_t *insn, int nbytes); + uprobe_opcode_t *insn, int nbytes, void *data); extern void __init uprobes_init(void); extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr); @@ -204,7 +204,8 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t, bool is_register); extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr, - uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr); + uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); @@ -240,6 +241,7 @@ extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void * extern void arch_uprobe_clear_state(struct mm_struct *mm); extern void arch_uprobe_init_state(struct mm_struct *mm); extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr); +extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eb07e602b6c9..4a194d7c838b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -192,7 +192,7 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src } static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn, - int nbytes) + int nbytes, void *data) { uprobe_opcode_t old_opcode; bool is_swbp; @@ -491,12 +491,13 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, bool is_register) { return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE, - verify_opcode, is_register, true /* do_update_ref_ctr */); + verify_opcode, is_register, true /* do_update_ref_ctr */, NULL); } int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes, - uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr) + 
uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr, + void *data) { const unsigned long vaddr = insn_vaddr & PAGE_MASK; struct mm_struct *mm = vma->vm_mm; @@ -530,7 +531,7 @@ retry: goto out; folio = page_folio(page); - ret = verify(page, insn_vaddr, insn, nbytes); + ret = verify(page, insn_vaddr, insn, nbytes, data); if (ret <= 0) { folio_put(folio); goto out; @@ -2696,6 +2697,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c return true; } +void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr) +{ +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -2760,6 +2765,9 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); + /* Try to optimize after first hit. */ + arch_uprobe_optimize(&uprobe->arch, bp_vaddr); + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; -- cgit v1.2.3 From 3202d6ed9368fc1e842fda73727553ae614633f8 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 21 Aug 2025 15:07:50 +0200 Subject: mmc: core: Add infrastructure for undervoltage handling Implement the core infrastructure to allow MMC bus types to handle REGULATOR_EVENT_UNDER_VOLTAGE events from power regulators. This is primarily aimed at allowing devices like eMMC to perform an emergency shutdown to prevent data corruption when a power failure is imminent. This patch introduces: - A new 'handle_undervoltage' function pointer to 'struct mmc_bus_ops'. Bus drivers (e.g., for eMMC) can implement this to define their emergency procedures. - A workqueue ('uv_work') in 'struct mmc_supply' to handle the event asynchronously in a high-priority context. - A new function 'mmc_handle_undervoltage()' which is called from the workqueue. It stops the host queue to prevent races with card removal, checks for the bus op, and invokes the handler. - Functions to register and unregister the regulator notifier, intended to be called by bus drivers like 'mmc_attach_mmc' when a compatible card is detected. The notifier is only registered for the main vmmc supply, as undervoltage handling for vqmmc or vqmmc2 is not required at this time. Signed-off-by: Oleksij Rempel Link: https://lore.kernel.org/r/20250821130751.2089587-2-o.rempel@pengutronix.de Signed-off-by: Ulf Hansson --- drivers/mmc/core/bus.c | 12 +++++++ drivers/mmc/core/core.c | 23 +++++++++++++ drivers/mmc/core/core.h | 5 +++ drivers/mmc/core/host.c | 2 ++ drivers/mmc/core/regulator.c | 77 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/mmc/host.h | 11 +++++++ 6 files changed, 130 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c index 1cf64e0952fb..ec4f3462bf80 100644 --- a/drivers/mmc/core/bus.c +++ b/drivers/mmc/core/bus.c @@ -19,6 +19,7 @@ #include #include +#include #include "core.h" #include "card.h" @@ -383,6 +384,14 @@ int mmc_add_card(struct mmc_card *card) mmc_card_set_present(card); + /* + * Register for undervoltage notification if the card supports + * power-off notification, enabling emergency shutdowns. 
+ */ + if (mmc_card_mmc(card) && + card->ext_csd.power_off_notification == EXT_CSD_POWER_ON) + mmc_regulator_register_undervoltage_notifier(card->host); + return 0; } @@ -394,6 +403,9 @@ void mmc_remove_card(struct mmc_card *card) { struct mmc_host *host = card->host; + if (mmc_card_present(card)) + mmc_regulator_unregister_undervoltage_notifier(host); + mmc_remove_card_debugfs(card); if (mmc_card_present(card)) { diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 88fd231fee1d..860378bea557 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -1398,6 +1398,29 @@ void mmc_power_cycle(struct mmc_host *host, u32 ocr) mmc_power_up(host, ocr); } +/** + * mmc_handle_undervoltage - Handle an undervoltage event on the MMC bus + * @host: The MMC host that detected the undervoltage condition + * + * This function is called when an undervoltage event is detected on one of + * the MMC regulators. + * + * Returns: 0 on success or a negative error code on failure. + */ +int mmc_handle_undervoltage(struct mmc_host *host) +{ + /* Stop the host to prevent races with card removal */ + __mmc_stop_host(host); + + if (!host->bus_ops || !host->bus_ops->handle_undervoltage) + return 0; + + dev_warn(mmc_dev(host), "%s: Undervoltage detected, initiating emergency stop\n", + mmc_hostname(host)); + + return host->bus_ops->handle_undervoltage(host); +} + /* * Assign a mmc bus handler to a host. Only one bus handler may control a * host at any given time. diff --git a/drivers/mmc/core/core.h b/drivers/mmc/core/core.h index 73f5d3d8c77d..a028b48be164 100644 --- a/drivers/mmc/core/core.h +++ b/drivers/mmc/core/core.h @@ -31,6 +31,7 @@ struct mmc_bus_ops { int (*sw_reset)(struct mmc_host *); bool (*cache_enabled)(struct mmc_host *); int (*flush_cache)(struct mmc_host *); + int (*handle_undervoltage)(struct mmc_host *host); }; void mmc_attach_bus(struct mmc_host *host, const struct mmc_bus_ops *ops); @@ -59,6 +60,10 @@ void mmc_power_off(struct mmc_host *host); void mmc_power_cycle(struct mmc_host *host, u32 ocr); void mmc_set_initial_state(struct mmc_host *host); u32 mmc_vddrange_to_ocrmask(int vdd_min, int vdd_max); +int mmc_handle_undervoltage(struct mmc_host *host); +void mmc_regulator_register_undervoltage_notifier(struct mmc_host *host); +void mmc_regulator_unregister_undervoltage_notifier(struct mmc_host *host); +void mmc_undervoltage_workfn(struct work_struct *work); static inline void mmc_delay(unsigned int ms) { diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index f14671ea5716..5f0ec23aeff5 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -564,6 +564,8 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) INIT_WORK(&host->sdio_irq_work, sdio_irq_work); timer_setup(&host->retune_timer, mmc_retune_timer, 0); + INIT_WORK(&host->supply.uv_work, mmc_undervoltage_workfn); + /* * By default, hosts do not support SGIO or large requests. * They have to set these according to their abilities. 
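For illustration, a bus type would wire the emergency path up through its mmc_bus_ops; a minimal sketch of an eMMC-side implementation, where only the .handle_undervoltage hook comes from this patch and the helper name is hypothetical:

	/* hypothetical eMMC handler: flush the cache and notify the card
	 * before VCC drops out of spec */
	static int mmc_handle_undervoltage_op(struct mmc_host *host)
	{
		return _mmc_emergency_poweroff(host);	/* hypothetical helper */
	}

	static const struct mmc_bus_ops mmc_ops = {
		/* ... existing ops ... */
		.handle_undervoltage = mmc_handle_undervoltage_op,
	};
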
diff --git a/drivers/mmc/core/regulator.c b/drivers/mmc/core/regulator.c index 3dae2e9b7978..a85179f1a4de 100644 --- a/drivers/mmc/core/regulator.c +++ b/drivers/mmc/core/regulator.c @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -262,6 +263,82 @@ static inline int mmc_regulator_get_ocrmask(struct regulator *supply) #endif /* CONFIG_REGULATOR */ +/* To be called from a high-priority workqueue */ +void mmc_undervoltage_workfn(struct work_struct *work) +{ + struct mmc_supply *supply; + struct mmc_host *host; + + supply = container_of(work, struct mmc_supply, uv_work); + host = container_of(supply, struct mmc_host, supply); + + mmc_handle_undervoltage(host); +} + +static int mmc_handle_regulator_event(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct mmc_supply *supply = container_of(nb, struct mmc_supply, + vmmc_nb); + struct mmc_host *host = container_of(supply, struct mmc_host, supply); + unsigned long flags; + + switch (event) { + case REGULATOR_EVENT_UNDER_VOLTAGE: + spin_lock_irqsave(&host->lock, flags); + if (host->undervoltage) { + spin_unlock_irqrestore(&host->lock, flags); + return NOTIFY_OK; + } + + host->undervoltage = true; + spin_unlock_irqrestore(&host->lock, flags); + + queue_work(system_highpri_wq, &host->supply.uv_work); + break; + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + +/** + * mmc_regulator_register_undervoltage_notifier - Register for undervoltage + * events + * @host: MMC host + * + * To be called by a bus driver when a card supporting graceful shutdown + * is attached. + */ +void mmc_regulator_register_undervoltage_notifier(struct mmc_host *host) +{ + int ret; + + if (IS_ERR_OR_NULL(host->supply.vmmc)) + return; + + host->supply.vmmc_nb.notifier_call = mmc_handle_regulator_event; + ret = regulator_register_notifier(host->supply.vmmc, + &host->supply.vmmc_nb); + if (ret) + dev_warn(mmc_dev(host), "Failed to register vmmc notifier: %d\n", ret); +} + +/** + * mmc_regulator_unregister_undervoltage_notifier - Unregister undervoltage + * notifier + * @host: MMC host + */ +void mmc_regulator_unregister_undervoltage_notifier(struct mmc_host *host) +{ + if (IS_ERR_OR_NULL(host->supply.vmmc)) + return; + + regulator_unregister_notifier(host->supply.vmmc, &host->supply.vmmc_nb); + cancel_work_sync(&host->supply.uv_work); +} + /** * mmc_regulator_get_supply - try to get VMMC and VQMMC regulators for a host * @mmc: the host to regulate diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 5ed5d203de23..e0d935a4ac1d 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -337,11 +337,15 @@ struct mmc_slot { struct regulator; struct mmc_pwrseq; +struct notifier_block; struct mmc_supply { struct regulator *vmmc; /* Card power supply */ struct regulator *vqmmc; /* Optional Vccq supply */ struct regulator *vqmmc2; /* Optional supply for phy */ + + struct notifier_block vmmc_nb; /* Notifier for vmmc */ + struct work_struct uv_work; /* Undervoltage work */ }; struct mmc_ctx { @@ -494,6 +498,13 @@ struct mmc_host { unsigned int can_dma_map_merge:1; /* merging can be used */ unsigned int vqmmc_enabled:1; /* vqmmc regulator is enabled */ + /* + * Indicates if an undervoltage event has already been handled. + * This prevents repeated regulator notifiers from triggering + * multiple REGULATOR_EVENT_UNDER_VOLTAGE events. 
+ */ + unsigned int undervoltage:1; /* Undervoltage state */ + int rescan_disable; /* disable card detection */ int rescan_entered; /* used with nonremovable devices */ -- cgit v1.2.3 From f41345f47fb267a9c95ca710c33448f8d0d81d83 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 20 Aug 2025 15:18:06 +0200 Subject: bpf: Use tnums for JEQ/JNE is_branch_taken logic In the following toy program (reg states minimized for readability), R0 and R1 always have different values at instruction 6. This is obvious when reading the program but cannot be guessed from ranges alone as they overlap (R0 in [0; 0xc0000000], R1 in [1024; 0xc0000400]). 0: call bpf_get_prandom_u32#7 ; R0_w=scalar() 1: w0 = w0 ; R0_w=scalar(var_off=(0x0; 0xffffffff)) 2: r0 >>= 30 ; R0_w=scalar(var_off=(0x0; 0x3)) 3: r0 <<= 30 ; R0_w=scalar(var_off=(0x0; 0xc0000000)) 4: r1 = r0 ; R1_w=scalar(var_off=(0x0; 0xc0000000)) 5: r1 += 1024 ; R1_w=scalar(var_off=(0x400; 0xc0000000)) 6: if r1 != r0 goto pc+1 Looking at tnums however, we can deduce that R1 is always different from R0 because their tnums don't agree on known bits: bit 10 (0x400) is known in both tnums, as 0 in R0 and as 1 in R1, so the two registers can never be equal. This patch uses this logic to improve is_scalar_branch_taken in case of BPF_JEQ and BPF_JNE. This change has a tiny impact on complexity, which was measured with the Cilium complexity CI test. That test covers 72 programs with various build and load time configurations for a total of 970 test cases. For 80% of test cases, the patch has no impact. On the other test cases, the patch decreases complexity by only 0.08% on average. In the best case, the verifier needs to walk 3% fewer instructions and, in the worst case, 1.5% more. Overall, the patch has a small positive impact, especially for our largest programs. Signed-off-by: Paul Chaignon Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/be3ee70b6e489c49881cb1646114b1d861b5c334.1755694147.git.paul.chaignon@gmail.com --- include/linux/tnum.h | 3 +++ kernel/bpf/tnum.c | 8 ++++++++ kernel/bpf/verifier.c | 4 ++++ 3 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 57ed3035cc30..0ffb77ffe0e8 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -51,6 +51,9 @@ struct tnum tnum_xor(struct tnum a, struct tnum b); /* Multiply two tnums, return @a * @b */ struct tnum tnum_mul(struct tnum a, struct tnum b); +/* Return true if the known bits of both tnums have the same value */ +bool tnum_overlap(struct tnum a, struct tnum b); + /* Return a tnum representing numbers satisfying both @a and @b */ struct tnum tnum_intersect(struct tnum a, struct tnum b); diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index fa353c5d550f..d9328bbb3680 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -143,6 +143,14 @@ struct tnum tnum_mul(struct tnum a, struct tnum b) return tnum_add(TNUM(acc_v, 0), acc_m); } +bool tnum_overlap(struct tnum a, struct tnum b) +{ + u64 mu; + + mu = ~a.mask & ~b.mask; + return (a.value & mu) == (b.value & mu); +} + /* Note that if a and b disagree - i.e. one has a 'known 1' where the other has * a 'known 0' - this will return a 'known 1' for that bit.
*/ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4e47992361ea..5c9dd16b2c56 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15897,6 +15897,8 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta */ if (tnum_is_const(t1) && tnum_is_const(t2)) return t1.value == t2.value; + if (!tnum_overlap(t1, t2)) + return 0; /* non-overlapping ranges */ if (umin1 > umax2 || umax1 < umin2) return 0; @@ -15921,6 +15923,8 @@ static int is_scalar_branch_taken(struct bpf_reg_state *reg1, struct bpf_reg_sta */ if (tnum_is_const(t1) && tnum_is_const(t2)) return t1.value != t2.value; + if (!tnum_overlap(t1, t2)) + return 1; /* non-overlapping ranges */ if (umin1 > umax2 || umax1 < umin2) return 1; -- cgit v1.2.3 From afa3701c0e45ecb9e4d160048ca4e353c7489948 Mon Sep 17 00:00:00 2001 From: Tiffany Yang Date: Thu, 21 Aug 2025 18:37:52 -0700 Subject: cgroup: cgroup.stat.local time accounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There isn't yet a clear way to identify a set of "lost" time that everyone (or at least a wider group of users) cares about. However, users can perform some delay accounting by iterating over components of interest. This patch allows cgroup v2 freezing time to be one of those components. Track the cumulative time that each v2 cgroup spends freezing and expose it to userland via a new local stat file in cgroupfs. Thank you to Michal, who provided the ASCII art in the updated documentation. To access this value: $ mkdir /sys/fs/cgroup/test $ cat /sys/fs/cgroup/test/cgroup.stat.local frozen_usec 0 Ensure consistent freeze time reads with freeze_seq, a per-cgroup sequence counter. Writes are serialized using the css_set_lock. Signed-off-by: Tiffany Yang Cc: Tejun Heo Cc: Michal Koutný Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 18 ++++++++++++++++++ include/linux/cgroup-defs.h | 17 +++++++++++++++++ kernel/cgroup/cgroup.c | 28 ++++++++++++++++++++++++++++ kernel/cgroup/freezer.c | 16 ++++++++++++---- 4 files changed, 75 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index d9d3cc7df348..9a3a909ee40b 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1001,6 +1001,24 @@ All cgroup core files are prefixed with "cgroup." Total number of dying cgroup subsystems (e.g. memory cgroup) at and beneath the current cgroup. + cgroup.stat.local + A read-only flat-keyed file which exists in non-root cgroups. + The following entry is defined: + + frozen_usec + Cumulative time that this cgroup has spent between freezing and + thawing, regardless of whether by self or ancestor groups. + NB: (not) reaching "frozen" state is not accounted here. + + Using the following ASCII representation of a cgroup's freezer + state, :: + + 1 _____ + frozen 0 __/ \__ + ab cd + + the duration being measured is the span between a and c. + cgroup.freeze A read-write single value file which exists on non-root cgroups. Allowed values are "0" and "1". The default is "0". diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 6b93a64115fe..539c64eeef38 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -433,6 +433,23 @@ struct cgroup_freezer_state { * frozen, SIGSTOPped, and PTRACEd.
*/ int nr_frozen_tasks; + + /* Freeze time data consistency protection */ + seqcount_t freeze_seq; + + /* + * Most recent time the cgroup was requested to freeze. + * Accesses guarded by freeze_seq counter. Writes serialized + * by css_set_lock. + */ + u64 freeze_start_nsec; + + /* + * Total duration the cgroup has spent freezing. + * Accesses guarded by freeze_seq counter. Writes serialized + * by css_set_lock. + */ + u64 frozen_nsec; }; struct cgroup { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 312c6a8b55bb..ab096b884bbc 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3763,6 +3763,27 @@ static int cgroup_stat_show(struct seq_file *seq, void *v) return 0; } +static int cgroup_core_local_stat_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + unsigned int sequence; + u64 freeze_time; + + do { + sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq); + freeze_time = cgrp->freezer.frozen_nsec; + /* Add in current freezer interval if the cgroup is freezing. */ + if (test_bit(CGRP_FREEZE, &cgrp->flags)) + freeze_time += (ktime_get_ns() - + cgrp->freezer.freeze_start_nsec); + } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence)); + + seq_printf(seq, "frozen_usec %llu\n", + (unsigned long long) freeze_time / NSEC_PER_USEC); + + return 0; +} + #ifdef CONFIG_CGROUP_SCHED /** * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem @@ -5354,6 +5375,11 @@ static struct cftype cgroup_base_files[] = { .name = "cgroup.stat", .seq_show = cgroup_stat_show, }, + { + .name = "cgroup.stat.local", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_core_local_stat_show, + }, { .name = "cgroup.freeze", .flags = CFTYPE_NOT_ON_ROOT, @@ -5763,6 +5789,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * if the parent has to be frozen, the child has too. */ cgrp->freezer.e_freeze = parent->freezer.e_freeze; + seqcount_init(&cgrp->freezer.freeze_seq); if (cgrp->freezer.e_freeze) { /* * Set the CGRP_FREEZE flag, so when a process will be @@ -5771,6 +5798,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, * consider it frozen immediately. */ set_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.freeze_start_nsec = ktime_get_ns(); set_bit(CGRP_FROZEN, &cgrp->flags); } diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index bf1690a167dd..6c18854bff34 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -171,7 +171,7 @@ static void cgroup_freeze_task(struct task_struct *task, bool freeze) /* * Freeze or unfreeze all tasks in the given cgroup. 
*/ -static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) +static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec) { struct css_task_iter it; struct task_struct *task; @@ -179,10 +179,16 @@ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - if (freeze) + write_seqcount_begin(&cgrp->freezer.freeze_seq); + if (freeze) { set_bit(CGRP_FREEZE, &cgrp->flags); - else + cgrp->freezer.freeze_start_nsec = ts_nsec; + } else { clear_bit(CGRP_FREEZE, &cgrp->flags); + cgrp->freezer.frozen_nsec += (ts_nsec - + cgrp->freezer.freeze_start_nsec); + } + write_seqcount_end(&cgrp->freezer.freeze_seq); spin_unlock_irq(&css_set_lock); if (freeze) @@ -260,6 +266,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) struct cgroup *parent; struct cgroup *dsct; bool applied = false; + u64 ts_nsec; bool old_e; lockdep_assert_held(&cgroup_mutex); @@ -271,6 +278,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) return; cgrp->freezer.freeze = freeze; + ts_nsec = ktime_get_ns(); /* * Propagate changes downwards the cgroup tree. @@ -298,7 +306,7 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze) /* * Do change actual state: freeze or unfreeze. */ - cgroup_do_freeze(dsct, freeze); + cgroup_do_freeze(dsct, freeze, ts_nsec); applied = true; } -- cgit v1.2.3 From d47cc4dea17391c99b943fa8d70a279e906b2843 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 11 Aug 2025 13:16:15 -0700 Subject: bpf: Use sha1() instead of sha1_transform() in bpf_prog_calc_tag() Now that there's a proper SHA-1 library API, just use that instead of the low-level SHA-1 compression function. This eliminates the need for bpf_prog_calc_tag() to implement the SHA-1 padding itself. No functional change; the computed tags remain the same. 
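The whole tag computation now boils down to a one-shot hash over the sanitized instruction image; a minimal sketch of the resulting shape (names taken from the patch below, with the map-fd scrubbing loop elided):

	u8 digest[SHA1_DIGEST_SIZE];

	/* dst holds a copy of the insns with unstable map fds zeroed out */
	sha1((const u8 *)dst, bpf_prog_insn_size(fp), digest);
	memcpy(fp->tag, digest, sizeof(fp->tag));

The sha1() library helper does the block handling and final length padding internally, which is what allows the manual SHA1_BLOCK_SIZE bookkeeping to be deleted.
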
Signed-off-by: Eric Biggers Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20250811201615.564461-1-ebiggers@kernel.org --- include/linux/filter.h | 6 ------ kernel/bpf/core.c | 50 +++++++++----------------------------------------- 2 files changed, 9 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index c0a74fb9fcb1..9092d8ea95c8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -997,12 +997,6 @@ static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) return prog->len * sizeof(struct bpf_insn); } -static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog) -{ - return round_up(bpf_prog_insn_size(prog) + - sizeof(__be64) + 1, SHA1_BLOCK_SIZE); -} - static inline unsigned int bpf_prog_size(unsigned int proglen) { return max(sizeof(struct bpf_prog), diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5d1650af899d..ef01cc644a96 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -18,6 +18,7 @@ */ #include +#include #include #include #include @@ -293,28 +294,19 @@ void __bpf_prog_free(struct bpf_prog *fp) int bpf_prog_calc_tag(struct bpf_prog *fp) { - const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64); - u32 raw_size = bpf_prog_tag_scratch_size(fp); - u32 digest[SHA1_DIGEST_WORDS]; - u32 ws[SHA1_WORKSPACE_WORDS]; - u32 i, bsize, psize, blocks; + size_t size = bpf_prog_insn_size(fp); + u8 digest[SHA1_DIGEST_SIZE]; struct bpf_insn *dst; bool was_ld_map; - u8 *raw, *todo; - __be32 *result; - __be64 *bits; + u32 i; - raw = vmalloc(raw_size); - if (!raw) + dst = vmalloc(size); + if (!dst) return -ENOMEM; - sha1_init_raw(digest); - memset(ws, 0, sizeof(ws)); - /* We need to take out the map fd for the digest calculation * since they are unstable from user space side. */ - dst = (void *)raw; for (i = 0, was_ld_map = false; i < fp->len; i++) { dst[i] = fp->insnsi[i]; if (!was_ld_map && @@ -334,33 +326,9 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) was_ld_map = false; } } - - psize = bpf_prog_insn_size(fp); - memset(&raw[psize], 0, raw_size - psize); - raw[psize++] = 0x80; - - bsize = round_up(psize, SHA1_BLOCK_SIZE); - blocks = bsize / SHA1_BLOCK_SIZE; - todo = raw; - if (bsize - psize >= sizeof(__be64)) { - bits = (__be64 *)(todo + bsize - sizeof(__be64)); - } else { - bits = (__be64 *)(todo + bsize + bits_offset); - blocks++; - } - *bits = cpu_to_be64((psize - 1) << 3); - - while (blocks--) { - sha1_transform(digest, todo, ws); - todo += SHA1_BLOCK_SIZE; - } - - result = (__force __be32 *)digest; - for (i = 0; i < SHA1_DIGEST_WORDS; i++) - result[i] = cpu_to_be32(digest[i]); - memcpy(fp->tag, result, sizeof(fp->tag)); - - vfree(raw); + sha1((const u8 *)dst, size, digest); + memcpy(fp->tag, digest, sizeof(fp->tag)); + vfree(dst); return 0; } -- cgit v1.2.3 From 3c716487936aa54083c130d46ad5747769695e09 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 14 Aug 2025 18:59:49 +0200 Subject: genirq: Remove GENERIC_IRQ_LEGACY IA64 is gone and with it the last GENERIC_IRQ_LEGACY user. Remove GENERIC_IRQ_LEGACY. 
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250814165949.hvtP03r4@linutronix.de --- include/linux/irq.h | 4 ---- kernel/irq/Kconfig | 4 ---- kernel/irq/irqdesc.c | 7 ------- 3 files changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 1d6b606a81ef..c9bcdbf6bc63 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -976,10 +976,6 @@ static inline void irq_free_desc(unsigned int irq) irq_free_descs(irq, 1); } -#ifdef CONFIG_GENERIC_IRQ_LEGACY -void irq_init_desc(unsigned int irq); -#endif - /** * struct irq_chip_regs - register offsets for struct irq_gci * @enable: Enable register offset to reg_base diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 1da5e9d9da71..36673640c4fc 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -6,10 +6,6 @@ menu "IRQ subsystem" config MAY_HAVE_SPARSE_IRQ bool -# Legacy support, required for itanic -config GENERIC_IRQ_LEGACY - bool - # Enable the generic irq autoprobe mechanism config GENERIC_IRQ_PROBE bool diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index b64c57b44c20..db714d3014b5 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -653,13 +653,6 @@ void irq_mark_irq(unsigned int irq) irq_insert_desc(irq, irq_desc + irq); } -#ifdef CONFIG_GENERIC_IRQ_LEGACY -void irq_init_desc(unsigned int irq) -{ - free_desc(irq); -} -#endif - #endif /* !CONFIG_SPARSE_IRQ */ int handle_irq_desc(struct irq_desc *desc) -- cgit v1.2.3 From 7a721a2fee2bce01af26699a87739db8ca8ea3c8 Mon Sep 17 00:00:00 2001 From: Inochi Amaoto Date: Thu, 14 Aug 2025 07:28:31 +0800 Subject: genirq: Add irq_chip_(startup/shutdown)_parent() As the MSI controller on SG2044 uses PLIC as the underlying interrupt controller, it needs to call irq_enable() and irq_disable() to start up/shut down interrupts. Otherwise, the MSI interrupt cannot be started up correctly and will not respond to any incoming interrupts. Introduce irq_chip_startup_parent() and irq_chip_shutdown_parent() to allow the interrupt controller to call the irq_startup()/irq_shutdown() callbacks of the parent interrupt chip. In case the irq_startup()/irq_shutdown() callbacks are not implemented for the parent interrupt chip, this falls back to irq_chip_enable_parent() or irq_chip_disable_parent().
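Both helpers match the corresponding irq_chip callback signatures, so a stacked chip that forwards everything to its parent can use them directly; a minimal sketch, assuming a hypothetical middle chip (only the *_parent helpers are real):

	static struct irq_chip my_msi_chip = {		/* hypothetical */
		.name         = "my-msi",
		.irq_mask     = irq_chip_mask_parent,
		.irq_unmask   = irq_chip_unmask_parent,
		.irq_startup  = irq_chip_startup_parent,	/* new */
		.irq_shutdown = irq_chip_shutdown_parent,	/* new */
	};
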
Suggested-by: Thomas Gleixner Signed-off-by: Inochi Amaoto Signed-off-by: Thomas Gleixner Tested-by: Chen Wang # Pioneerbox Reviewed-by: Chen Wang Link: https://lore.kernel.org/all/20250813232835.43458-2-inochiama@gmail.com Link: https://lore.kernel.org/lkml/20250722224513.22125-1-inochiama@gmail.com/ --- include/linux/irq.h | 2 ++ kernel/irq/chip.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index c9bcdbf6bc63..c67e76fbcc07 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -669,6 +669,8 @@ extern int irq_chip_set_parent_state(struct irq_data *data, extern int irq_chip_get_parent_state(struct irq_data *data, enum irqchip_irq_state which, bool *state); +extern void irq_chip_shutdown_parent(struct irq_data *data); +extern unsigned int irq_chip_startup_parent(struct irq_data *data); extern void irq_chip_enable_parent(struct irq_data *data); extern void irq_chip_disable_parent(struct irq_data *data); extern void irq_chip_ack_parent(struct irq_data *data); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 0d0276378c70..3ffa0d80ddd1 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1259,6 +1259,43 @@ int irq_chip_get_parent_state(struct irq_data *data, } EXPORT_SYMBOL_GPL(irq_chip_get_parent_state); +/** + * irq_chip_shutdown_parent - Shutdown the parent interrupt + * @data: Pointer to interrupt specific data + * + * Invokes the irq_shutdown() callback of the parent if available or falls + * back to irq_chip_disable_parent(). + */ +void irq_chip_shutdown_parent(struct irq_data *data) +{ + struct irq_data *parent = data->parent_data; + + if (parent->chip->irq_shutdown) + parent->chip->irq_shutdown(parent); + else + irq_chip_disable_parent(data); +} +EXPORT_SYMBOL_GPL(irq_chip_shutdown_parent); + +/** + * irq_chip_startup_parent - Startup the parent interrupt + * @data: Pointer to interrupt specific data + * + * Invokes the irq_startup() callback of the parent if available or falls + * back to irq_chip_enable_parent(). + */ +unsigned int irq_chip_startup_parent(struct irq_data *data) +{ + struct irq_data *parent = data->parent_data; + + if (parent->chip->irq_startup) + return parent->chip->irq_startup(parent); + + irq_chip_enable_parent(data); + return 0; +} +EXPORT_SYMBOL_GPL(irq_chip_startup_parent); + /** * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if * NULL) -- cgit v1.2.3 From 54f45a30c0d0153d2be091ba2d683ab6db6d1d5b Mon Sep 17 00:00:00 2001 From: Inochi Amaoto Date: Thu, 14 Aug 2025 07:28:32 +0800 Subject: PCI/MSI: Add startup/shutdown for per device domains As the RISC-V PLIC cannot apply affinity settings without invoking irq_enable(), it will make the interrupt unavailable when used as an underlying interrupt chip for the MSI controller. Implement the irq_startup() and irq_shutdown() callbacks for the PCI MSI and MSI-X templates. For chips that specify MSI_FLAG_PCI_MSI_STARTUP_PARENT, the parent startup and shutdown functions are invoked. That allows the interrupt on the parent chip to be enabled if the interrupt has not been enabled during allocation. This is necessary for MSI controllers which use the PLIC as the underlying parent interrupt chip.
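A parent MSI domain opts in by advertising the new flag alongside the existing parent mask/unmask one; a hedged sketch of a hypothetical controller's msi_parent_ops (everything except the MSI_FLAG_* constants is made up):

	static const struct msi_parent_ops my_msi_parent_ops = {
		.supported_flags = MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX,
		.required_flags  = MSI_FLAG_USE_DEF_DOM_OPS |
				   MSI_FLAG_USE_DEF_CHIP_OPS |
				   MSI_FLAG_PCI_MSI_MASK_PARENT |
				   MSI_FLAG_PCI_MSI_STARTUP_PARENT,	/* new */
		/* ... */
	};

With the flag set, the cond_startup_parent()/cond_shutdown_parent() helpers in the templates below call through to the parent chip; without it, behavior is unchanged.
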
Suggested-by: Thomas Gleixner Signed-off-by: Inochi Amaoto Signed-off-by: Thomas Gleixner Tested-by: Chen Wang # Pioneerbox Reviewed-by: Chen Wang Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/all/20250813232835.43458-3-inochiama@gmail.com --- drivers/pci/msi/irqdomain.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/msi.h | 2 ++ 2 files changed, 54 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index 0938ef7ebabf..e0a800f918e8 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -148,6 +148,23 @@ static void pci_device_domain_set_desc(msi_alloc_info_t *arg, struct msi_desc *d arg->hwirq = desc->msi_index; } +static void cond_shutdown_parent(struct irq_data *data) +{ + struct msi_domain_info *info = data->domain->host_data; + + if (unlikely(info->flags & MSI_FLAG_PCI_MSI_STARTUP_PARENT)) + irq_chip_shutdown_parent(data); +} + +static unsigned int cond_startup_parent(struct irq_data *data) +{ + struct msi_domain_info *info = data->domain->host_data; + + if (unlikely(info->flags & MSI_FLAG_PCI_MSI_STARTUP_PARENT)) + return irq_chip_startup_parent(data); + return 0; +} + static __always_inline void cond_mask_parent(struct irq_data *data) { struct msi_domain_info *info = data->domain->host_data; @@ -164,6 +181,23 @@ static __always_inline void cond_unmask_parent(struct irq_data *data) irq_chip_unmask_parent(data); } +static void pci_irq_shutdown_msi(struct irq_data *data) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + + pci_msi_mask(desc, BIT(data->irq - desc->irq)); + cond_shutdown_parent(data); +} + +static unsigned int pci_irq_startup_msi(struct irq_data *data) +{ + struct msi_desc *desc = irq_data_get_msi_desc(data); + unsigned int ret = cond_startup_parent(data); + + pci_msi_unmask(desc, BIT(data->irq - desc->irq)); + return ret; +} + static void pci_irq_mask_msi(struct irq_data *data) { struct msi_desc *desc = irq_data_get_msi_desc(data); @@ -194,6 +228,8 @@ static void pci_irq_unmask_msi(struct irq_data *data) static const struct msi_domain_template pci_msi_template = { .chip = { .name = "PCI-MSI", + .irq_startup = pci_irq_startup_msi, + .irq_shutdown = pci_irq_shutdown_msi, .irq_mask = pci_irq_mask_msi, .irq_unmask = pci_irq_unmask_msi, .irq_write_msi_msg = pci_msi_domain_write_msg, @@ -210,6 +246,20 @@ static const struct msi_domain_template pci_msi_template = { }, }; +static void pci_irq_shutdown_msix(struct irq_data *data) +{ + pci_msix_mask(irq_data_get_msi_desc(data)); + cond_shutdown_parent(data); +} + +static unsigned int pci_irq_startup_msix(struct irq_data *data) +{ + unsigned int ret = cond_startup_parent(data); + + pci_msix_unmask(irq_data_get_msi_desc(data)); + return ret; +} + static void pci_irq_mask_msix(struct irq_data *data) { pci_msix_mask(irq_data_get_msi_desc(data)); @@ -234,6 +284,8 @@ EXPORT_SYMBOL_GPL(pci_msix_prepare_desc); static const struct msi_domain_template pci_msix_template = { .chip = { .name = "PCI-MSIX", + .irq_startup = pci_irq_startup_msix, + .irq_shutdown = pci_irq_shutdown_msix, .irq_mask = pci_irq_mask_msix, .irq_unmask = pci_irq_unmask_msix, .irq_write_msi_msg = pci_msi_domain_write_msg, diff --git a/include/linux/msi.h b/include/linux/msi.h index e5e86a8529fb..3111ba95fbde 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -568,6 +568,8 @@ enum { MSI_FLAG_PARENT_PM_DEV = (1 << 8), /* Support for parent mask/unmask */ MSI_FLAG_PCI_MSI_MASK_PARENT = (1 << 9), + /* Support for parent startup/shutdown */ + 
MSI_FLAG_PCI_MSI_STARTUP_PARENT = (1 << 10), /* Mask for the generic functionality */ MSI_GENERIC_FLAGS_MASK = GENMASK(15, 0), -- cgit v1.2.3 From 92a96b0a227e91dc42475265a1ce766b6cd044fa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 17 Aug 2025 23:09:18 +0100 Subject: io_uring: add request poisoning Poison various request fields on free. __io_req_caches_free() is a slow path, so can be done unconditionally, but gate it on kasan for io_req_add_to_cache(). Note that some fields are logically retained between cache allocations and can't be poisoned in io_req_add_to_cache(). Ideally, it'd be replaced with KASAN'ed caches, but that can't be enabled because of some synchronisation nuances. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7a78e8a7f5be434313c400650b862e36c211b312.1755459452.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/poison.h | 3 +++ io_uring/io_uring.c | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/poison.h b/include/linux/poison.h index 8ca2235f78d5..299e2dd7da6d 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -90,4 +90,7 @@ /********** lib/stackdepot.c **********/ #define STACK_DEPOT_POISON ((void *)(0xD390 + POISON_POINTER_DELTA)) +/********** io_uring/ **********/ +#define IO_URING_PTR_POISON ((void *)(0x1091UL + POISON_POINTER_DELTA)) + #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 93633613a165..e511949086dd 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -179,6 +179,26 @@ static const struct ctl_table kernel_io_uring_disabled_table[] = { }; #endif +static void io_poison_cached_req(struct io_kiocb *req) +{ + req->ctx = IO_URING_PTR_POISON; + req->tctx = IO_URING_PTR_POISON; + req->file = IO_URING_PTR_POISON; + req->creds = IO_URING_PTR_POISON; + req->io_task_work.func = IO_URING_PTR_POISON; + req->apoll = IO_URING_PTR_POISON; +} + +static void io_poison_req(struct io_kiocb *req) +{ + io_poison_cached_req(req); + req->async_data = IO_URING_PTR_POISON; + req->kbuf = IO_URING_PTR_POISON; + req->comp_list.next = IO_URING_PTR_POISON; + req->file_node = IO_URING_PTR_POISON; + req->link = IO_URING_PTR_POISON; +} + static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); @@ -235,6 +255,8 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res) static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) { + if (IS_ENABLED(CONFIG_KASAN)) + io_poison_cached_req(req); wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); } @@ -2767,6 +2789,7 @@ static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); + io_poison_req(req); kmem_cache_free(req_cachep, req); nr++; } -- cgit v1.2.3 From 5fda51255439addd1c9059098e30847a375a1008 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Aug 2025 20:03:39 -0600 Subject: io_uring/kbuf: switch to storing struct io_buffer_list locally Currently the buffer list is stored in struct io_kiocb. The buffer list can be of two types: 1) Classic/legacy buffer list. These don't need to get referenced after a buffer pick, and hence storing them in struct io_kiocb is perfectly fine. 2) Ring provided buffer lists. These DO need to be referenced after the initial buffer pick, as they need to get consumed later on. 
This can be either just incrementing the head of the ring, or it can be consuming parts of a buffer if incremental buffer consumption has been configured. For case 2, io_uring needs to be careful not to access the buffer list after the initial pick-and-execute context. The core does recycling of these, but it's easy to make a mistake, because it's stored in the io_kiocb which does persist across multiple execution contexts, either because it's a multishot request or simply because it needed some kind of async trigger (eg poll) for retry purposes. Add a struct io_buffer_list to struct io_br_sel, which is always on stack for the various users of it. This prevents the buffer list from leaking outside of that execution context, and additionally it enables kbuf to not even pass back the struct io_buffer_list if the given context isn't appropriately locked already. This doesn't fix any bugs, it's simply a defensive measure to prevent any issues with reuse of a buffer list. Link: https://lore.kernel.org/r/20250821020750.598432-12-axboe@kernel.dk Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 ------ io_uring/io_uring.c | 6 +++--- io_uring/kbuf.c | 27 ++++++++++++++----------- io_uring/kbuf.h | 16 ++++++--------- io_uring/net.c | 46 ++++++++++++++++-------------------- io_uring/poll.c | 6 +++--- io_uring/rw.c | 22 ++++++++++---------- 7 files changed, 55 insertions(+), 74 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 80a178f3d896..1d33984611bc 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -674,12 +674,6 @@ struct io_kiocb { /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ struct io_buffer *kbuf; - /* - * stores buffer ID for ring provided buffers, valid IFF - * REQ_F_BUFFER_RING is set. - */ - struct io_buffer_list *buf_list; - struct io_rsrc_node *buf_node; }; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 27bc1486f07b..985b4681e513 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1007,7 +1007,7 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res) lockdep_assert_held(&req->ctx->uring_lock); req_set_fail(req); - io_req_set_res(req, res, io_put_kbuf(req, res, req->buf_list)); + io_req_set_res(req, res, io_put_kbuf(req, res, NULL)); if (def->fail) def->fail(req); io_req_complete_defer(req); @@ -2025,11 +2025,11 @@ fail: switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: - io_kbuf_recycle(req, req->buf_list, 0); + io_kbuf_recycle(req, NULL, 0); io_req_task_queue(req); break; case IO_APOLL_ABORTED: - io_kbuf_recycle(req, req->buf_list, 0); + io_kbuf_recycle(req, NULL, 0); io_queue_iowq(req); break; case IO_APOLL_OK: diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 21c12c437ab9..3e9aab21af9d 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -171,8 +171,8 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len, if (*len == 0 || *len > buf->len) *len = buf->len; req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; - req->buf_list = bl; req->buf_index = buf->bid; + sel.buf_list = bl; sel.addr = u64_to_user_ptr(buf->addr); if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) { @@ -186,8 +186,8 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len, * the transfer completes (or if we get -EAGAIN and must poll of * retry).
*/ - io_kbuf_commit(req, bl, *len, 1); - req->buf_list = NULL; + io_kbuf_commit(req, sel.buf_list, *len, 1); + sel.buf_list = NULL; } return sel; } @@ -294,7 +294,6 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, req->flags |= REQ_F_BL_EMPTY; req->flags |= REQ_F_BUFFER_RING; - req->buf_list = bl; return iov - arg->iovs; } @@ -302,16 +301,15 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, struct io_br_sel *sel, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; int ret = -ENOENT; io_ring_submit_lock(ctx, issue_flags); - bl = io_buffer_get_list(ctx, arg->buf_group); - if (unlikely(!bl)) + sel->buf_list = io_buffer_get_list(ctx, arg->buf_group); + if (unlikely(!sel->buf_list)) goto out_unlock; - if (bl->flags & IOBL_BUF_RING) { - ret = io_ring_buffers_peek(req, arg, bl); + if (sel->buf_list->flags & IOBL_BUF_RING) { + ret = io_ring_buffers_peek(req, arg, sel->buf_list); /* * Don't recycle these buffers if we need to go through poll. * Nobody else can use them anyway, and holding on to provided @@ -321,13 +319,16 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, */ if (ret > 0) { req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE; - io_kbuf_commit(req, bl, arg->out_len, ret); + io_kbuf_commit(req, sel->buf_list, arg->out_len, ret); } } else { - ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); + ret = io_provided_buffers_select(req, &arg->out_len, sel->buf_list, arg->iovs); } out_unlock: - io_ring_submit_unlock(ctx, issue_flags); + if (issue_flags & IO_URING_F_UNLOCKED) { + sel->buf_list = NULL; + mutex_unlock(&ctx->uring_lock); + } return ret; } @@ -348,10 +349,12 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, ret = io_ring_buffers_peek(req, arg, bl); if (ret > 0) req->flags |= REQ_F_BUFFERS_COMMIT; + sel->buf_list = bl; return ret; } /* don't support multiple buffer selections for legacy */ + sel->buf_list = NULL; return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index b1723c2620da..1a539969fc9c 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -63,11 +63,14 @@ struct buf_sel_arg { }; /* - * Return value from io_buffer_list selection. Just returns the error or - * user address for now, will be extended to return the buffer list in the - * future. + * Return value from io_buffer_list selection, to avoid stashing it in + * struct io_kiocb. For legacy/classic provided buffers, keeping a reference + * across execution contexts are fine. But for ring provided buffers, the + * list may go away as soon as ->uring_lock is dropped. As the io_kiocb + * persists, it's better to just keep the buffer local for those cases. */ struct io_br_sel { + struct io_buffer_list *buf_list; /* * Some selection parts return the user address, others return an error. */ @@ -107,13 +110,6 @@ struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, static inline bool io_kbuf_recycle_ring(struct io_kiocb *req, struct io_buffer_list *bl) { - /* - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear - * the flag and hence ensure that bl->head doesn't get incremented. - * If the tail has already been incremented, hang on to it. - * The exception is partial io, that case we should increment bl->head - * to monopolize the buffer. 
- */ if (bl) { req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); return true; diff --git a/io_uring/net.c b/io_uring/net.c index 4eb208d24169..b00cd2f59091 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -433,7 +433,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->opcode == IORING_OP_SENDMSG) return -EINVAL; sr->msg_flags |= MSG_WAITALL; - req->buf_list = NULL; req->flags |= REQ_F_MULTISHOT; } @@ -512,11 +511,11 @@ static inline bool io_send_finish(struct io_kiocb *req, unsigned int cflags; if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { - cflags = io_put_kbuf(req, sel->val, req->buf_list); + cflags = io_put_kbuf(req, sel->val, sel->buf_list); goto finish; } - cflags = io_put_kbufs(req, sel->val, req->buf_list, io_bundle_nbufs(kmsg, sel->val)); + cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val)); if (bundle_finished || req->flags & REQ_F_BL_EMPTY) goto finish; @@ -657,6 +656,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) flags |= MSG_DONTWAIT; retry_bundle: + sel.buf_list = NULL; if (io_do_buffer_select(req)) { ret = io_send_select_buffer(req, issue_flags, &sel, kmsg); if (ret) @@ -682,7 +682,7 @@ retry_bundle: sr->len -= ret; sr->buf += ret; sr->done_io += ret; - return io_net_kbuf_recyle(req, req->buf_list, kmsg, ret); + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -795,18 +795,8 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->flags |= REQ_F_NOWAIT; if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; - if (req->flags & REQ_F_BUFFER_SELECT) { - /* - * Store the buffer group for this multishot receive separately, - * as if we end up doing an io-wq based issue that selects a - * buffer, it has to be committed immediately and that will - * clear ->buf_list. This means we lose the link to the buffer - * list, and the eventual buffer put on completion then cannot - * restore it. 
- */ + if (req->flags & REQ_F_BUFFER_SELECT) sr->buf_group = req->buf_index; - req->buf_list = NULL; - } sr->mshot_total_len = sr->mshot_len = 0; if (sr->flags & IORING_RECV_MULTISHOT) { if (!(req->flags & REQ_F_BUFFER_SELECT)) @@ -874,7 +864,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, if (sr->flags & IORING_RECVSEND_BUNDLE) { size_t this_ret = sel->val - sr->done_io; - cflags |= io_put_kbufs(req, this_ret, req->buf_list, io_bundle_nbufs(kmsg, this_ret)); + cflags |= io_put_kbufs(req, this_ret, sel->buf_list, io_bundle_nbufs(kmsg, this_ret)); if (sr->flags & IORING_RECV_RETRY) cflags = req->cqe.flags | (cflags & CQE_F_MASK); if (sr->mshot_len && sel->val >= sr->mshot_len) @@ -896,7 +886,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, return false; } } else { - cflags |= io_put_kbuf(req, sel->val, req->buf_list); + cflags |= io_put_kbuf(req, sel->val, sel->buf_list); } /* @@ -1038,6 +1028,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) flags |= MSG_DONTWAIT; retry_multishot: + sel.buf_list = NULL; if (io_do_buffer_select(req)) { size_t len = sr->len; @@ -1048,7 +1039,7 @@ retry_multishot: if (req->flags & REQ_F_APOLL_MULTISHOT) { ret = io_recvmsg_prep_multishot(kmsg, sr, &sel.addr, &len); if (ret) { - io_kbuf_recycle(req, req->buf_list, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); return ret; } } @@ -1072,14 +1063,12 @@ retry_multishot: if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - if (issue_flags & IO_URING_F_MULTISHOT) - io_kbuf_recycle(req, req->buf_list, issue_flags); - + io_kbuf_recycle(req, sel.buf_list, issue_flags); return IOU_RETRY; } if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; - return io_net_kbuf_recyle(req, req->buf_list, kmsg, ret); + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1093,7 +1082,7 @@ retry_multishot: else if (sr->done_io) ret = sr->done_io; else - io_kbuf_recycle(req, req->buf_list, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); sel.val = ret; if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags)) @@ -1178,7 +1167,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; - struct io_br_sel sel = { }; + struct io_br_sel sel; struct socket *sock; unsigned flags; int ret, min_ret = 0; @@ -1198,6 +1187,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) flags |= MSG_DONTWAIT; retry_multishot: + sel.buf_list = NULL; if (io_do_buffer_select(req)) { sel.val = sr->len; ret = io_recv_buf_select(req, kmsg, &sel, issue_flags); @@ -1217,16 +1207,14 @@ retry_multishot: ret = sock_recvmsg(sock, &kmsg->msg, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - if (issue_flags & IO_URING_F_MULTISHOT) - io_kbuf_recycle(req, req->buf_list, issue_flags); - + io_kbuf_recycle(req, sel.buf_list, issue_flags); return IOU_RETRY; } if (ret > 0 && io_net_retry(sock, flags)) { sr->len -= ret; sr->buf += ret; sr->done_io += ret; - return io_net_kbuf_recyle(req, req->buf_list, kmsg, ret); + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1242,7 +1230,7 @@ out_free: else if (sr->done_io) ret = sr->done_io; else - io_kbuf_recycle(req, req->buf_list, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); sel.val = ret; if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags)) diff --git 
a/io_uring/poll.c b/io_uring/poll.c index 07ab22380c78..f3852bf7627b 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -316,10 +316,10 @@ void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) ret = io_poll_check_events(req, tw); if (ret == IOU_POLL_NO_ACTION) { - io_kbuf_recycle(req, req->buf_list, 0); + io_kbuf_recycle(req, NULL, 0); return; } else if (ret == IOU_POLL_REQUEUE) { - io_kbuf_recycle(req, req->buf_list, 0); + io_kbuf_recycle(req, NULL, 0); __io_poll_execute(req, 0); return; } @@ -686,7 +686,7 @@ int io_arm_apoll(struct io_kiocb *req, unsigned issue_flags, __poll_t mask) req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; - io_kbuf_recycle(req, req->buf_list, issue_flags); + io_kbuf_recycle(req, NULL, issue_flags); ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags); if (ret) diff --git a/io_uring/rw.c b/io_uring/rw.c index 2b106f644383..906e869d330a 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -579,7 +579,7 @@ void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) io_req_io_end(req); if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) - req->cqe.flags |= io_put_kbuf(req, req->cqe.res, req->buf_list); + req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL); io_req_rw_cleanup(req, 0); io_req_task_complete(req, tw); @@ -648,7 +648,7 @@ static inline void io_rw_done(struct io_kiocb *req, ssize_t ret) } static int kiocb_done(struct io_kiocb *req, ssize_t ret, - unsigned int issue_flags) + struct io_br_sel *sel, unsigned int issue_flags) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned final_ret = io_fixup_rw_res(req, ret); @@ -662,7 +662,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, * from the submission path. */ io_req_io_end(req); - io_req_set_res(req, final_ret, io_put_kbuf(req, ret, req->buf_list)); + io_req_set_res(req, final_ret, io_put_kbuf(req, ret, sel->buf_list)); io_req_rw_cleanup(req, issue_flags); return IOU_COMPLETE; } else { @@ -1024,10 +1024,10 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) ret = __io_read(req, &sel, issue_flags); if (ret >= 0) - return kiocb_done(req, ret, issue_flags); + return kiocb_done(req, ret, &sel, issue_flags); if (req->flags & REQ_F_BUFFERS_COMMIT) - io_kbuf_recycle(req, req->buf_list, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); return ret; } @@ -1057,15 +1057,15 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) * Reset rw->len to 0 again to avoid clamping future mshot * reads, in case the buffer size varies. */ - if (io_kbuf_recycle(req, req->buf_list, issue_flags)) + if (io_kbuf_recycle(req, sel.buf_list, issue_flags)) rw->len = 0; return IOU_RETRY; } else if (ret <= 0) { - io_kbuf_recycle(req, req->buf_list, issue_flags); + io_kbuf_recycle(req, sel.buf_list, issue_flags); if (ret < 0) req_set_fail(req); } else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - cflags = io_put_kbuf(req, ret, req->buf_list); + cflags = io_put_kbuf(req, ret, sel.buf_list); } else { /* * Any successful return value will keep the multishot read @@ -1073,7 +1073,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) * we fail to post a CQE, or multishot is no longer set, then * jump to the termination path. This request is then done. 
*/ - cflags = io_put_kbuf(req, ret, req->buf_list); + cflags = io_put_kbuf(req, ret, sel.buf_list); rw->len = 0; /* similarly to above, reset len to 0 */ if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { @@ -1202,7 +1202,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } done: - return kiocb_done(req, ret2, issue_flags); + return kiocb_done(req, ret2, NULL, issue_flags); } else { ret_eagain: iov_iter_restore(&io->iter, &io->iter_state); @@ -1370,7 +1370,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (!smp_load_acquire(&req->iopoll_completed)) break; nr_events++; - req->cqe.flags = io_put_kbuf(req, req->cqe.res, req->buf_list); + req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL); if (req->opcode != IORING_OP_URING_CMD) io_req_rw_cleanup(req, 0); } -- cgit v1.2.3 From d589bcddaa3f8b1668499c3f0466863df3abe37a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 21 Aug 2025 12:02:06 +0800 Subject: io-uring: move `struct io_br_sel` into io_uring_types.h Move `struct io_br_sel` into io_uring_types.h and prepare for supporting provided buffer on uring_cmd. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250821040210.1152145-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 19 +++++++++++++++++++ io_uring/kbuf.h | 18 ------------------ 2 files changed, 19 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 1d33984611bc..9c6c548f43f5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -85,6 +85,25 @@ struct io_mapped_region { unsigned flags; }; +/* + * Return value from io_buffer_list selection, to avoid stashing it in + * struct io_kiocb. For legacy/classic provided buffers, keeping a reference + * across execution contexts are fine. But for ring provided buffers, the + * list may go away as soon as ->uring_lock is dropped. As the io_kiocb + * persists, it's better to just keep the buffer local for those cases. + */ +struct io_br_sel { + struct io_buffer_list *buf_list; + /* + * Some selection parts return the user address, others return an error. + */ + union { + void __user *addr; + ssize_t val; + }; +}; + + /* * Arbitrary limit, can be raised if need be */ diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 32f73adbe1e9..ada382ff38d7 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -62,24 +62,6 @@ struct buf_sel_arg { unsigned short partial_map; }; -/* - * Return value from io_buffer_list selection, to avoid stashing it in - * struct io_kiocb. For legacy/classic provided buffers, keeping a reference - * across execution contexts are fine. But for ring provided buffers, the - * list may go away as soon as ->uring_lock is dropped. As the io_kiocb - * persists, it's better to just keep the buffer local for those cases. - */ -struct io_br_sel { - struct io_buffer_list *buf_list; - /* - * Some selection parts return the user address, others return an error. 
- */ - union { - void __user *addr; - ssize_t val; - }; -}; - struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len, unsigned buf_group, unsigned int issue_flags); int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, -- cgit v1.2.3 From 620a50c927004f5c9420a7ca9b1a55673dbf3941 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 21 Aug 2025 12:02:07 +0800 Subject: io_uring: uring_cmd: add multishot support Add UAPI flag IORING_URING_CMD_MULTISHOT for supporting multishot uring_cmd operations with provided buffer. This enables drivers to post multiple completion events from a single uring_cmd submission, which is useful for: - Notifying userspace of device events (e.g., interrupt handling) - Supporting devices with multiple event sources (e.g., multi-queue devices) - Avoiding the need for device poll() support when events originate from multiple sources device-wide The implementation adds two new APIs: - io_uring_cmd_buffer_select(): selects a buffer from the provided buffer group for multishot uring_cmd - io_uring_mshot_cmd_post_cqe(): posts a CQE after event data is pushed to the provided buffer Multishot uring_cmd must be used with buffer select (IOSQE_BUFFER_SELECT) and is mutually exclusive with IORING_URING_CMD_FIXED for now. The ublk driver will be the first user of this functionality: https://github.com/ming1/linux/commits/ublk-devel/ Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250821040210.1152145-3-ming.lei@redhat.com [axboe: fold in fix for !CONFIG_IO_URING] Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 26 ++++++++++++++++ include/uapi/linux/io_uring.h | 6 +++- io_uring/opdef.c | 1 + io_uring/uring_cmd.c | 71 ++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 102 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index cfa6d0c0c322..4bd3a7339243 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -70,6 +70,21 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, /* Execute the request from a blocking context */ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); +/* + * Select a buffer from the provided buffer group for multishot uring_cmd. + * Returns the selected buffer address and size. + */ +struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, + unsigned buf_group, size_t *len, + unsigned int issue_flags); + +/* + * Complete a multishot uring_cmd event. This will post a CQE to the completion + * queue and update the provided buffer.
+ */ +bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, + struct io_br_sel *sel, unsigned int issue_flags); + #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, @@ -102,6 +117,17 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) { } +static inline struct io_br_sel +io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, unsigned buf_group, + size_t *len, unsigned int issue_flags) +{ + return (struct io_br_sel) { .val = -EOPNOTSUPP }; +} +static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, + struct io_br_sel *sel, unsigned int issue_flags) +{ + return true; +} #endif /* diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6957dc539d83..1e935f8901c5 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -298,9 +298,13 @@ enum io_uring_op { * sqe->uring_cmd_flags top 8bits aren't available for userspace * IORING_URING_CMD_FIXED use registered buffer; pass this flag * along with setting sqe->buf_index. + * IORING_URING_CMD_MULTISHOT must be used with buffer select, like other + * multishot commands. Not compatible with + * IORING_URING_CMD_FIXED, for now. */ #define IORING_URING_CMD_FIXED (1U << 0) -#define IORING_URING_CMD_MASK IORING_URING_CMD_FIXED +#define IORING_URING_CMD_MULTISHOT (1U << 1) +#define IORING_URING_CMD_MASK (IORING_URING_CMD_FIXED | IORING_URING_CMD_MULTISHOT) /* diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 9568785810d9..932319633eac 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -413,6 +413,7 @@ const struct io_issue_def io_issue_defs[] = { #endif }, [IORING_OP_URING_CMD] = { + .buffer_select = 1, .needs_file = 1, .plug = 1, .iopoll = 1, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 053bac89b6c0..3cfb5d51b88a 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -11,6 +11,7 @@ #include "io_uring.h" #include "alloc_cache.h" #include "rsrc.h" +#include "kbuf.h" #include "uring_cmd.h" #include "poll.h" @@ -194,8 +195,21 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ioucmd->flags & ~IORING_URING_CMD_MASK) return -EINVAL; - if (ioucmd->flags & IORING_URING_CMD_FIXED) + if (ioucmd->flags & IORING_URING_CMD_FIXED) { + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) + return -EINVAL; req->buf_index = READ_ONCE(sqe->buf_index); + } + + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) { + if (ioucmd->flags & IORING_URING_CMD_FIXED) + return -EINVAL; + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + } else { + if (req->flags & REQ_F_BUFFER_SELECT) + return -EINVAL; + } ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); @@ -251,6 +265,10 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } ret = file->f_op->uring_cmd(ioucmd, issue_flags); + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) { + if (ret >= 0) + return IOU_ISSUE_SKIP_COMPLETE; + } if (ret == -EAGAIN) { ioucmd->flags |= IORING_URING_CMD_REISSUE; return ret; @@ -333,3 +351,54 @@ bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd, return false; return io_req_post_cqe32(req, cqe); } + +/* + * Work with io_uring_mshot_cmd_post_cqe() together for committing the + * provided buffer upfront + */ +struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, + unsigned buf_group, size_t *len, + unsigned int issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + + if (!(ioucmd->flags
& IORING_URING_CMD_MULTISHOT)) + return (struct io_br_sel) { .val = -EINVAL }; + + if (WARN_ON_ONCE(!io_do_buffer_select(req))) + return (struct io_br_sel) { .val = -EINVAL }; + + return io_buffer_select(req, len, buf_group, issue_flags); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_buffer_select); + +/* + * Return true if this multishot uring_cmd needs to be completed, otherwise + * the event CQE is posted successfully. + * + * This function must use `struct io_br_sel` returned from + * io_uring_cmd_buffer_select() for committing the buffer in the same + * uring_cmd submission context. + */ +bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, + struct io_br_sel *sel, unsigned int issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + unsigned int cflags = 0; + + if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT)) + return true; + + if (sel->val > 0) { + cflags = io_put_kbuf(req, sel->val, sel->buf_list); + if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) + return false; + } + + io_kbuf_recycle(req, sel->buf_list, issue_flags); + if (sel->val < 0) + req_set_fail(req); + io_req_set_res(req, sel->val, cflags); + return true; +} +EXPORT_SYMBOL_GPL(io_uring_mshot_cmd_post_cqe); -- cgit v1.2.3 From d0201c4436c53412146d526855c585fa9d54ca13 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Aug 2025 14:01:46 -0600 Subject: io_uring: remove io_ctx_cqe32() helper It's pretty pointless and only used for the tracing helper, so get rid of it. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 ------ include/trace/events/io_uring.h | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 9c6c548f43f5..d1e25f3fe0b3 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -740,10 +740,4 @@ struct io_overflow_cqe { struct list_head list; struct io_uring_cqe cqe; }; - -static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx) -{ - return ctx->flags & IORING_SETUP_CQE32; -} - #endif diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 178ab6f611be..6a970625a3ea 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -340,8 +340,8 @@ TP_PROTO(struct io_ring_ctx *ctx, void *req, struct io_uring_cqe *cqe), __entry->user_data = cqe->user_data; __entry->res = cqe->res; __entry->cflags = cqe->flags; - __entry->extra1 = io_ctx_cqe32(ctx) ? cqe->big_cqe[0] : 0; - __entry->extra2 = io_ctx_cqe32(ctx) ? cqe->big_cqe[1] : 0; + __entry->extra1 = ctx->flags & IORING_SETUP_CQE32 ? cqe->big_cqe[0] : 0; + __entry->extra2 = ctx->flags & IORING_SETUP_CQE32 ? cqe->big_cqe[1] : 0; ), TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x " -- cgit v1.2.3 From 6e376f245f19feeadddafb2c3fa5fbd6469ecdfe Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Aug 2025 11:48:42 +0200 Subject: gpio: generic: provide to_gpio_generic_chip() Provide a helper that converts a struct gpio_chip address to the struct gpio_generic_chip that wraps it.
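A minimal usage sketch (not part of the patch; the foo_ names are hypothetical), showing a driver recovering the wrapping generic chip from the core-facing gpio_chip it is handed:

#include <linux/gpio/generic.h>
#include <linux/printk.h>

/* hypothetical callback: the GPIO core passes us the inner gpio_chip */
static void foo_report_chip(struct gpio_chip *gc)
{
	/* container_of() back to the wrapper that embeds gc */
	struct gpio_generic_chip *gen = to_gpio_generic_chip(gc);

	pr_info("generic chip %p wraps gpio_chip %p\n", gen, &gen->gc);
}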
Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-1-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/generic.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h index f3a8db4598bb..5a85ecbef8d2 100644 --- a/include/linux/gpio/generic.h +++ b/include/linux/gpio/generic.h @@ -55,6 +55,12 @@ struct gpio_generic_chip { struct gpio_chip gc; }; +static inline struct gpio_generic_chip * +to_gpio_generic_chip(struct gpio_chip *gc) +{ + return container_of(gc, struct gpio_generic_chip, gc); +} + /** * gpio_generic_chip_init() - Initialize a generic GPIO chip. * @chip: Generic GPIO chip to set up. -- cgit v1.2.3 From 16397871b6e35fa46a2bec27b3558f93b050c6fc Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Aug 2025 11:48:43 +0200 Subject: gpio: generic: provide helpers for reading and writing registers Provide helpers wrapping the read_reg() and write_reg() callbacks of the generic GPIO API that are called directly by many users. This is done to hide their implementation ahead of moving them into the separate generic GPIO struct. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250825-gpio-mmio-gpio-conv-v1-2-356b4b1d5110@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/generic.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h index 5a85ecbef8d2..4c0626b53ec9 100644 --- a/include/linux/gpio/generic.h +++ b/include/linux/gpio/generic.h @@ -100,6 +100,37 @@ gpio_generic_chip_set(struct gpio_generic_chip *chip, unsigned int offset, return chip->gc.set(&chip->gc, offset, value); } +/** + * gpio_generic_read_reg() - Read a register using the underlying callback. + * @chip: Generic GPIO chip to use. + * @reg: Register to read. + * + * Returns: value read from register. + */ +static inline unsigned long +gpio_generic_read_reg(struct gpio_generic_chip *chip, void __iomem *reg) +{ + if (WARN_ON(!chip->gc.read_reg)) + return 0; + + return chip->gc.read_reg(reg); +} + +/** + * gpio_generic_write_reg() - Write a register using the underlying callback. + * @chip: Generic GPIO chip to use. + * @reg: Register to write to. + * @val: New value to write. + */ +static inline void gpio_generic_write_reg(struct gpio_generic_chip *chip, + void __iomem *reg, unsigned long val) +{ + if (WARN_ON(!chip->gc.write_reg)) + return; + + chip->gc.write_reg(reg, val); +} + #define gpio_generic_chip_lock(gen_gc) \ raw_spin_lock(&(gen_gc)->gc.bgpio_lock) -- cgit v1.2.3 From 60ad9a07319283e6e1094cef3e972e754315c024 Mon Sep 17 00:00:00 2001 From: Junjie Cao Date: Wed, 20 Aug 2025 08:47:55 +0800 Subject: iio: core: switch info_mask fields to unsigned long to match find_bit helpers for_each_set_bit()/find_*_bit() expect arrays of unsigned long (see include/linux/find.h), but industrialio-core passed const long * into iio_device_add_info_mask_type{,_avail}(). These masks are used purely as bit arrays and are populated via BIT() (1UL << n). Switch the info_mask_* fields and the corresponding function parameters to unsigned long so the types match the helpers. This removes sparse warnings about signedness mismatches (seen with 'make C=1' CF='-Wsparse-all') without changing behavior or struct layout. No functional change intended. 
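As a sketch of the effect (the channel definition below is hypothetical, not from the patch): with unsigned long fields, the masks can be handed to the find_bit helpers without a cast or a sparse warning.

#include <linux/bitops.h>
#include <linux/iio/iio.h>
#include <linux/printk.h>

static const struct iio_chan_spec foo_chan = {
	.type = IIO_VOLTAGE,
	.info_mask_separate = BIT(IIO_CHAN_INFO_RAW) |
			      BIT(IIO_CHAN_INFO_SCALE),
};

static void foo_dump_info_bits(void)
{
	unsigned int bit;

	/* info_mask_separate is unsigned long, matching find_next_bit() */
	for_each_set_bit(bit, &foo_chan.info_mask_separate, BITS_PER_LONG)
		pr_info("info bit %u is set\n", bit);
}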
Suggested-by: Jonathan Cameron Signed-off-by: Junjie Cao Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20250820004755.69627-1-junjie.cao@intel.com Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-core.c | 4 ++-- include/linux/iio/iio.h | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index eb6a54f8115d..38848d753a33 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -1244,7 +1244,7 @@ static int iio_device_add_channel_label(struct iio_dev *indio_dev, static int iio_device_add_info_mask_type(struct iio_dev *indio_dev, struct iio_chan_spec const *chan, enum iio_shared_by shared_by, - const long *infomask) + const unsigned long *infomask) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); int i, ret, attrcount = 0; @@ -1274,7 +1274,7 @@ static int iio_device_add_info_mask_type(struct iio_dev *indio_dev, static int iio_device_add_info_mask_type_avail(struct iio_dev *indio_dev, struct iio_chan_spec const *chan, enum iio_shared_by shared_by, - const long *infomask) + const unsigned long *infomask) { struct iio_dev_opaque *iio_dev_opaque = to_iio_dev_opaque(indio_dev); int i, ret, attrcount = 0; diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 2f5560646ee4..872ebdf0dd77 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -271,14 +271,14 @@ struct iio_chan_spec { unsigned int num_ext_scan_type; }; }; - long info_mask_separate; - long info_mask_separate_available; - long info_mask_shared_by_type; - long info_mask_shared_by_type_available; - long info_mask_shared_by_dir; - long info_mask_shared_by_dir_available; - long info_mask_shared_by_all; - long info_mask_shared_by_all_available; + unsigned long info_mask_separate; + unsigned long info_mask_separate_available; + unsigned long info_mask_shared_by_type; + unsigned long info_mask_shared_by_type_available; + unsigned long info_mask_shared_by_dir; + unsigned long info_mask_shared_by_dir_available; + unsigned long info_mask_shared_by_all; + unsigned long info_mask_shared_by_all_available; const struct iio_event_spec *event_spec; unsigned int num_event_specs; const struct iio_chan_spec_ext_info *ext_info; -- cgit v1.2.3 From 7a6fc1634cea6f220228a69b1c0210e6b8b1aaf0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:45 -0700 Subject: blk-mq-dma: create blk_map_iter type The req_iterator happens to have similar fields to what the DMA iterator needs, but we're not necessarily iterating a request's bi_io_vec. Create a new type that can be amended for additional future use. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Reviewed-by: Martin K.
Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-2-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 4 ++-- include/linux/blk-mq-dma.h | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index ad283017caef..51e7a0ff045f 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -10,7 +10,7 @@ struct phys_vec { u32 len; }; -static bool blk_map_iter_next(struct request *req, struct req_iterator *iter, +static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, struct phys_vec *vec) { unsigned int max_size; @@ -246,7 +246,7 @@ blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg) { - struct req_iterator iter = { + struct blk_map_iter iter = { .bio = rq->bio, }; struct phys_vec vec; diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index c26a01aeae00..6a7e3828673d 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -5,6 +5,11 @@ #include #include +struct blk_map_iter { + struct bvec_iter iter; + struct bio *bio; +}; + struct blk_dma_iter { /* Output address range for this iteration */ dma_addr_t addr; @@ -14,7 +19,7 @@ struct blk_dma_iter { blk_status_t status; /* Internal to blk_rq_dma_map_iter_* */ - struct req_iterator iter; + struct blk_map_iter iter; struct pci_p2pdma_map_state p2pdma; }; -- cgit v1.2.3 From dae75dead2359edd7c55e1964e0edf7d03535b31 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:46 -0700 Subject: blk-mq-dma: provide the bio_vec array being iterated This will make it easier to add different sources of the bvec array, like for upcoming integrity support, rather than assuming the bio's bi_io_vec is used. It also makes iterating "special" payloads more consistent with iterating normal payloads. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Kanchan Joshi Reviewed-by: Martin K.
Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-3-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 56 +++++++++++++++++++++++++++------------------- include/linux/blk-mq-dma.h | 1 + 2 files changed, 34 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 51e7a0ff045f..8f41fe740b46 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -16,23 +16,14 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, unsigned int max_size; struct bio_vec bv; - if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { - if (!iter->bio) - return false; - vec->paddr = bvec_phys(&req->special_vec); - vec->len = req->special_vec.bv_len; - iter->bio = NULL; - return true; - } - if (!iter->iter.bi_size) return false; - bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + bv = mp_bvec_iter_bvec(iter->bvecs, iter->iter); vec->paddr = bvec_phys(&bv); max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX); bv.bv_len = min(bv.bv_len, max_size); - bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len); + bvec_iter_advance_single(iter->bvecs, &iter->iter, bv.bv_len); /* * If we are entirely done with this bi_io_vec entry, check if the next @@ -43,19 +34,20 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, struct bio_vec next; if (!iter->iter.bi_size) { - if (!iter->bio->bi_next) + if (!iter->bio || !iter->bio->bi_next) break; iter->bio = iter->bio->bi_next; iter->iter = iter->bio->bi_iter; + iter->bvecs = iter->bio->bi_io_vec; } - next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + next = mp_bvec_iter_bvec(iter->bvecs, iter->iter); if (bv.bv_len + next.bv_len > max_size || !biovec_phys_mergeable(req->q, &bv, &next)) break; bv.bv_len += next.bv_len; - bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len); + bvec_iter_advance_single(iter->bvecs, &iter->iter, next.bv_len); } vec->len = bv.bv_len; @@ -125,6 +117,30 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, return true; } +static inline void blk_rq_map_iter_init(struct request *rq, + struct blk_map_iter *iter) +{ + struct bio *bio = rq->bio; + + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { + *iter = (struct blk_map_iter) { + .bvecs = &rq->special_vec, + .iter = { + .bi_size = rq->special_vec.bv_len, + } + }; + } else if (bio) { + *iter = (struct blk_map_iter) { + .bio = bio, + .bvecs = bio->bi_io_vec, + .iter = bio->bi_iter, + }; + } else { + /* the internal flush request may not have bio attached */ + *iter = (struct blk_map_iter) {}; + } +} + /** * blk_rq_dma_map_iter_start - map the first DMA segment for a request * @req: request to map @@ -153,8 +169,7 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, unsigned int total_len = blk_rq_payload_bytes(req); struct phys_vec vec; - iter->iter.bio = req->bio; - iter->iter.iter = req->bio->bi_iter; + blk_rq_map_iter_init(req, &iter->iter); memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); iter->status = BLK_STS_OK; @@ -246,16 +261,11 @@ blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg) { - struct blk_map_iter iter = { - .bio = rq->bio, - }; + struct blk_map_iter iter; struct phys_vec vec; int nsegs = 0; - /* the internal flush request may not have bio attached */ - if (iter.bio) - iter.iter = iter.bio->bi_iter; - + blk_rq_map_iter_init(rq, &iter); while 
(blk_map_iter_next(rq, &iter, &vec)) { *last_sg = blk_next_sg(last_sg, sglist); sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index 6a7e3828673d..e5cb5e46fc92 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -8,6 +8,7 @@ struct blk_map_iter { struct bvec_iter iter; struct bio *bio; + struct bio_vec *bvecs; }; struct blk_dma_iter { -- cgit v1.2.3 From 92fb75fd14b041038e30bc725ab4c1e625243573 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:47 -0700 Subject: blk-mq-dma: require unmap caller provide p2p map type In preparing for integrity dma mappings, we can't rely on the request flag because data and metadata may have different mapping types. Signed-off-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-4-kbusch@meta.com Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 9 ++++++++- include/linux/blk-mq-dma.h | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 2c6d9506b172..111b6bc6c93e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -261,6 +261,9 @@ enum nvme_iod_flags { /* single segment dma mapping */ IOD_SINGLE_SEGMENT = 1U << 2, + + /* DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */ + IOD_P2P_BUS_ADDR = 1U << 3, }; struct nvme_dma_vec { @@ -725,7 +728,8 @@ static void nvme_unmap_data(struct request *req) return; } - if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) { + if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len, + iod->flags & IOD_P2P_BUS_ADDR)) { if (nvme_pci_cmd_use_sgl(&iod->cmd)) nvme_free_sgls(req); else @@ -1000,6 +1004,9 @@ static blk_status_t nvme_map_data(struct request *req) if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter)) return iter.status; + if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) + iod->flags |= IOD_P2P_BUS_ADDR; + if (use_sgl == SGL_FORCED || (use_sgl == SGL_SUPPORTED && (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold))) diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index e5cb5e46fc92..881880095e0d 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -47,14 +47,15 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) * @dma_dev: device to unmap from * @state: DMA IOVA state * @mapped_len: number of bytes to unmap + * @is_p2p: true if mapped with PCI_P2PDMA_MAP_BUS_ADDR * * Returns %false if the callers need to manually unmap every DMA segment * mapped using @iter or %true if no work is left to be done. */ static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, size_t mapped_len) + struct dma_iova_state *state, size_t mapped_len, bool is_p2p) { - if (req->cmd_flags & REQ_P2PDMA) + if (is_p2p) return true; if (dma_use_iova(state)) { -- cgit v1.2.3 From 7092639031a1bd5320ab827e8f665350f332b7ce Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:48 -0700 Subject: blk-mq: remove REQ_P2PDMA flag It's not serving any particular purpose. pci_p2pdma_state() already has all the appropriate checks, so the config and flag checks are not guarding anything. Signed-off-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-5-kbusch@meta.com Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk-mq-dma.c | 30 ++++++++++++++---------------- include/linux/blk_types.h | 2 -- 3 files changed, 15 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 3b371a5da159..44c43b970387 100644 --- a/block/bio.c +++ b/block/bio.c @@ -981,7 +981,7 @@ void __bio_add_page(struct bio *bio, struct page *page, WARN_ON_ONCE(bio_full(bio, len)); if (is_pci_p2pdma_page(page)) - bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE; + bio->bi_opf |= REQ_NOMERGE; bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 8f41fe740b46..58defab21882 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -180,22 +180,20 @@ bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, if (!blk_map_iter_next(req, &iter->iter, &vec)) return false; - if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) { - switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, - phys_to_page(vec.paddr))) { - case PCI_P2PDMA_MAP_BUS_ADDR: - return blk_dma_map_bus(iter, &vec); - case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: - /* - * P2P transfers through the host bridge are treated the - * same as non-P2P transfers below and during unmap. - */ - req->cmd_flags &= ~REQ_P2PDMA; - break; - default: - iter->status = BLK_STS_INVAL; - return false; - } + switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, + phys_to_page(vec.paddr))) { + case PCI_P2PDMA_MAP_BUS_ADDR: + return blk_dma_map_bus(iter, &vec); + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + /* + * P2P transfers through the host bridge are treated the + * same as non-P2P transfers below and during unmap. + */ + case PCI_P2PDMA_MAP_NONE: + break; + default: + iter->status = BLK_STS_INVAL; + return false; } if (blk_can_dma_map_iova(req, dma_dev) && diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 09b99d52fd36..930daff207df 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -386,7 +386,6 @@ enum req_flag_bits { __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ __REQ_ATOMIC, /* for atomic write operations */ - __REQ_P2PDMA, /* contains P2P DMA pages */ /* * Command specific flags, keep last: */ @@ -419,7 +418,6 @@ enum req_flag_bits { #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) -#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) -- cgit v1.2.3 From fec9b16dc5550191fd85af118271ea00e8dcc5f8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Aug 2025 08:31:50 -0700 Subject: blk-mq-dma: add scatter-less integrity data DMA mapping Similar to regular data, introduce more efficient integrity mapping helpers that do away with the scatterlist structure. This uses the block mapping iterator to add IOVA segments if IOMMU is enabled, or maps directly if not. This also supports P2P segments if integrity data ever wants to allocate that type of memory. Signed-off-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Martin K.
Petersen Link: https://lore.kernel.org/r/20250813153153.3260897-7-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 104 +++++++++++++++++++++++++++++++++++++++--- include/linux/blk-integrity.h | 17 +++++++ include/linux/blk-mq-dma.h | 1 + 3 files changed, 115 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 31dd8f58f081..60a244a129c3 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2025 Christoph Hellwig */ +#include #include #include "blk.h" @@ -10,6 +11,24 @@ struct phys_vec { u32 len; }; +static bool __blk_map_iter_next(struct blk_map_iter *iter) +{ + if (iter->iter.bi_size) + return true; + if (!iter->bio || !iter->bio->bi_next) + return false; + + iter->bio = iter->bio->bi_next; + if (iter->is_integrity) { + iter->iter = bio_integrity(iter->bio)->bip_iter; + iter->bvecs = bio_integrity(iter->bio)->bip_vec; + } else { + iter->iter = iter->bio->bi_iter; + iter->bvecs = iter->bio->bi_io_vec; + } + return true; +} + static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, struct phys_vec *vec) { @@ -33,13 +52,8 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter, while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) { struct bio_vec next; - if (!iter->iter.bi_size) { - if (!iter->bio || !iter->bio->bi_next) - break; - iter->bio = iter->bio->bi_next; - iter->iter = iter->bio->bi_iter; - iter->bvecs = iter->bio->bi_io_vec; - } + if (!__blk_map_iter_next(iter)) + break; next = mp_bvec_iter_bvec(iter->bvecs, iter->iter); if (bv.bv_len + next.bv_len > max_size || @@ -290,3 +304,79 @@ int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, return nsegs; } EXPORT_SYMBOL(__blk_rq_map_sg); + +#ifdef CONFIG_BLK_DEV_INTEGRITY +/** + * blk_rq_integrity_dma_map_iter_start - map the first integrity DMA segment + * for a request + * @req: request to map + * @dma_dev: device to map to + * @state: DMA IOVA state + * @iter: block layer DMA iterator + * + * Start DMA mapping @req integrity data to @dma_dev. @state and @iter are + * provided by the caller and don't need to be initialized. @state needs to be + * stored for use at unmap time, @iter is only needed at map time. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true if it did map a segment. + * + * If a segment was mapped, the DMA address for it is returned in @iter.addr + * and the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. + * + * The caller can call blk_rq_dma_map_coalesce() to check if further segments + * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next() + * to try to map the following segments. 
+ */ +bool blk_rq_integrity_dma_map_iter_start(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + struct blk_dma_iter *iter) +{ + unsigned len = bio_integrity_bytes(&req->q->limits.integrity, + blk_rq_sectors(req)); + struct bio *bio = req->bio; + + iter->iter = (struct blk_map_iter) { + .bio = bio, + .iter = bio_integrity(bio)->bip_iter, + .bvecs = bio_integrity(bio)->bip_vec, + .is_integrity = true, + }; + return blk_dma_map_iter_start(req, dma_dev, state, iter, len); +} +EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_start); + +/** + * blk_rq_integrity_dma_map_iter_next - map the next integrity DMA segment for + * a request + * @req: request to map + * @dma_dev: device to map to + * @state: DMA IOVA state + * @iter: block layer DMA iterator + * + * Iterate to the next integrity mapping after a previous call to + * blk_rq_integrity_dma_map_iter_start(). See there for a detailed description + * of the arguments. + * + * Returns %false if there is no segment to map, including due to an error, or + * %true if it did map a segment. + * + * If a segment was mapped, the DMA address for it is returned in @iter.addr and + * the length in @iter.len. If no segment was mapped the status code is + * returned in @iter.status. + */ +bool blk_rq_integrity_dma_map_iter_next(struct request *req, + struct device *dma_dev, struct blk_dma_iter *iter) +{ + struct phys_vec vec; + + if (!blk_map_iter_next(req, &iter->iter, &vec)) + return false; + + if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) + return blk_dma_map_bus(iter, &vec); + return blk_dma_map_direct(req, dma_dev, iter, &vec); +} +EXPORT_SYMBOL_GPL(blk_rq_integrity_dma_map_iter_next); +#endif diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index e67a2b6e8f11..78fe2459e661 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -4,6 +4,7 @@ #include #include +#include struct request; @@ -31,6 +32,11 @@ int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes); int blk_get_meta_cap(struct block_device *bdev, unsigned int cmd, struct logical_block_metadata_cap __user *argp); +bool blk_rq_integrity_dma_map_iter_start(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + struct blk_dma_iter *iter); +bool blk_rq_integrity_dma_map_iter_next(struct request *req, + struct device *dma_dev, struct blk_dma_iter *iter); static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) @@ -115,6 +121,17 @@ static inline int blk_rq_integrity_map_user(struct request *rq, { return -EINVAL; } +static inline bool blk_rq_integrity_dma_map_iter_start(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + struct blk_dma_iter *iter) +{ + return false; +} +static inline bool blk_rq_integrity_dma_map_iter_next(struct request *req, + struct device *dma_dev, struct blk_dma_iter *iter) +{ + return false; +} static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) { return NULL; diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index 881880095e0d..0f45ea110ca1 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -9,6 +9,7 @@ struct blk_map_iter { struct bvec_iter iter; struct bio *bio; struct bio_vec *bvecs; + bool is_integrity; }; struct blk_dma_iter { -- cgit v1.2.3 From a214365140cc3009f07d4e14a8b481fd3dc41d31 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 10 Jul 2025 15:15:28 +0300 Subject: rculist: move list_for_each_rcu() to
where it belongs list_for_each_rcu() relies on the rcu_dereference() API, which is not provided by list.h. At the same time, list.h is a low-level basic header that must not have dependencies like RCU, besides the fact that it could create circular dependencies in some cases. With all that said, move the RCU-related API to rculist.h, where it belongs. Signed-off-by: Andy Shevchenko Reviewed-by: Simona Vetter Reviewed-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay (AMD) Signed-off-by: "Paul E. McKenney" --- include/linux/list.h | 10 ---------- include/linux/rculist.h | 10 ++++++++++ kernel/cgroup/dmem.c | 1 + 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index e7e28afd28f8..e7bdad9b8618 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -686,16 +686,6 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) -/** - * list_for_each_rcu - Iterate over a list in an RCU-safe fashion - * @pos: the &struct list_head to use as a loop cursor. - * @head: the head for your list. - */ -#define list_for_each_rcu(pos, head) \ - for (pos = rcu_dereference((head)->next); \ - !list_is_head(pos, (head)); \ - pos = rcu_dereference(pos->next)) - /** * list_for_each_continue - continue iteration over a list * @pos: the &struct list_head to use as a loop cursor. diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 1b11926ddd47..2abba7552605 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -42,6 +42,16 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) */ #define list_bidir_prev_rcu(list) (*((struct list_head __rcu **)(&(list)->prev))) +/** + * list_for_each_rcu - Iterate over a list in an RCU-safe fashion + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. + */ +#define list_for_each_rcu(pos, head) \ + for (pos = rcu_dereference((head)->next); \ + !list_is_head(pos, (head)); \ + pos = rcu_dereference(pos->next)) + /** * list_tail_rcu - returns the prev pointer of the head of the list * @head: the head of the list diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c index 10b63433f057..e12b946278b6 100644 --- a/kernel/cgroup/dmem.c +++ b/kernel/cgroup/dmem.c @@ -14,6 +14,7 @@ #include #include #include +#include #include struct dmem_cgroup_region { -- cgit v1.2.3 From f45fc18b6de04483643e8aa2ab97737abfe03d59 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 23 Aug 2025 09:56:03 +0200 Subject: net: airoha: Add airoha_ppe_dev struct definition Introduce the airoha_ppe_dev struct as a container for PPE offload callbacks consumed by the MT76 driver during flowtable offload for traffic received by the wlan NIC and forwarded to the wired one. Add the airoha_ppe_setup_tc_block_cb routine to the PPE offload ops for the MT76 driver. Rely on an airoha_ppe_dev pointer in the airoha_ppe_setup_tc_block_cb signature.
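A hedged consumer-side sketch (the wlan_ function name is hypothetical, and a real consumer would likely cache the handle at probe time rather than per call):

#include <linux/err.h>
#include <linux/soc/airoha/airoha_offload.h>

static int wlan_offload_flow(struct device *dev, void *flow_cls)
{
	struct airoha_ppe_dev *ppe_dev;
	int err;

	ppe_dev = airoha_ppe_get_dev(dev);	/* resolves the "airoha,eth" phandle */
	if (IS_ERR_OR_NULL(ppe_dev))		/* the !CONFIG_NET_AIROHA stub returns NULL */
		return ppe_dev ? PTR_ERR(ppe_dev) : -EOPNOTSUPP;

	err = airoha_ppe_dev_setup_tc_block_cb(ppe_dev, flow_cls);

	airoha_ppe_put_dev(ppe_dev);
	return err;
}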
Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250823-airoha-en7581-wlan-rx-offload-v3-2-f78600ec3ed8@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 4 +- drivers/net/ethernet/airoha/airoha_eth.h | 4 +- drivers/net/ethernet/airoha/airoha_npu.c | 1 - drivers/net/ethernet/airoha/airoha_ppe.c | 67 +++++++++++++++++++++++++++++-- include/linux/soc/airoha/airoha_offload.h | 35 ++++++++++++++++ 5 files changed, 104 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index e6b802e3d844..5a04f90dd3de 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -2599,13 +2599,15 @@ static int airoha_dev_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) { struct net_device *dev = cb_priv; + struct airoha_gdm_port *port = netdev_priv(dev); + struct airoha_eth *eth = port->qdma->eth; if (!tc_can_offload(dev)) return -EOPNOTSUPP; switch (type) { case TC_SETUP_CLSFLOWER: - return airoha_ppe_setup_tc_block_cb(dev, type_data); + return airoha_ppe_setup_tc_block_cb(ð->ppe->dev, type_data); case TC_SETUP_CLSMATCHALL: return airoha_dev_tc_matchall(dev, type_data); default: diff --git a/drivers/net/ethernet/airoha/airoha_eth.h b/drivers/net/ethernet/airoha/airoha_eth.h index 9f721e2b972f..9060b1d2814e 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.h +++ b/drivers/net/ethernet/airoha/airoha_eth.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #define AIROHA_MAX_NUM_GDM_PORTS 4 @@ -546,6 +547,7 @@ struct airoha_gdm_port { #define AIROHA_RXD4_FOE_ENTRY GENMASK(15, 0) struct airoha_ppe { + struct airoha_ppe_dev dev; struct airoha_eth *eth; void *foe; @@ -622,7 +624,7 @@ bool airoha_is_valid_gdm_port(struct airoha_eth *eth, void airoha_ppe_check_skb(struct airoha_ppe *ppe, struct sk_buff *skb, u16 hash); -int airoha_ppe_setup_tc_block_cb(struct net_device *dev, void *type_data); +int airoha_ppe_setup_tc_block_cb(struct airoha_ppe_dev *dev, void *type_data); int airoha_ppe_init(struct airoha_eth *eth); void airoha_ppe_deinit(struct airoha_eth *eth); void airoha_ppe_init_upd_mem(struct airoha_gdm_port *port); diff --git a/drivers/net/ethernet/airoha/airoha_npu.c b/drivers/net/ethernet/airoha/airoha_npu.c index 1a6b191ae0b0..e1d131d6115c 100644 --- a/drivers/net/ethernet/airoha/airoha_npu.c +++ b/drivers/net/ethernet/airoha/airoha_npu.c @@ -11,7 +11,6 @@ #include #include #include -#include #include "airoha_eth.h" diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index 36b45e98279a..03d9b1f24bb3 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -6,8 +6,9 @@ #include #include +#include +#include #include -#include #include #include @@ -1282,10 +1283,10 @@ error_npu_put: return err; } -int airoha_ppe_setup_tc_block_cb(struct net_device *dev, void *type_data) +int airoha_ppe_setup_tc_block_cb(struct airoha_ppe_dev *dev, void *type_data) { - struct airoha_gdm_port *port = netdev_priv(dev); - struct airoha_eth *eth = port->qdma->eth; + struct airoha_ppe *ppe = dev->priv; + struct airoha_eth *eth = ppe->eth; int err = 0; mutex_lock(&flow_offload_mutex); @@ -1338,6 +1339,61 @@ void airoha_ppe_init_upd_mem(struct airoha_gdm_port *port) PPE_UPDMEM_WR_MASK | PPE_UPDMEM_REQ_MASK); } +struct airoha_ppe_dev *airoha_ppe_get_dev(struct device *dev) +{ + struct platform_device *pdev; + 
struct device_node *np; + struct airoha_eth *eth; + + np = of_parse_phandle(dev->of_node, "airoha,eth", 0); + if (!np) + return ERR_PTR(-ENODEV); + + pdev = of_find_device_by_node(np); + if (!pdev) { + dev_err(dev, "cannot find device node %s\n", np->name); + of_node_put(np); + return ERR_PTR(-ENODEV); + } + of_node_put(np); + + if (!try_module_get(THIS_MODULE)) { + dev_err(dev, "failed to get the device driver module\n"); + goto error_pdev_put; + } + + eth = platform_get_drvdata(pdev); + if (!eth) + goto error_module_put; + + if (!device_link_add(dev, &pdev->dev, DL_FLAG_AUTOREMOVE_SUPPLIER)) { + dev_err(&pdev->dev, + "failed to create device link to consumer %s\n", + dev_name(dev)); + goto error_module_put; + } + + return ð->ppe->dev; + +error_module_put: + module_put(THIS_MODULE); +error_pdev_put: + platform_device_put(pdev); + + return ERR_PTR(-ENODEV); +} +EXPORT_SYMBOL_GPL(airoha_ppe_get_dev); + +void airoha_ppe_put_dev(struct airoha_ppe_dev *dev) +{ + struct airoha_ppe *ppe = dev->priv; + struct airoha_eth *eth = ppe->eth; + + module_put(THIS_MODULE); + put_device(eth->dev); +} +EXPORT_SYMBOL_GPL(airoha_ppe_put_dev); + int airoha_ppe_init(struct airoha_eth *eth) { struct airoha_ppe *ppe; @@ -1347,6 +1403,9 @@ int airoha_ppe_init(struct airoha_eth *eth) if (!ppe) return -ENOMEM; + ppe->dev.ops.setup_tc_block_cb = airoha_ppe_setup_tc_block_cb; + ppe->dev.priv = ppe; + foe_size = PPE_NUM_ENTRIES * sizeof(struct airoha_foe_entry); ppe->foe = dmam_alloc_coherent(eth->dev, foe_size, &ppe->foe_dma, GFP_KERNEL); diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h index 117c63c2448d..4b4b8b9e426d 100644 --- a/include/linux/soc/airoha/airoha_offload.h +++ b/include/linux/soc/airoha/airoha_offload.h @@ -9,6 +9,41 @@ #include #include +struct airoha_ppe_dev { + struct { + int (*setup_tc_block_cb)(struct airoha_ppe_dev *dev, + void *type_data); + } ops; + + void *priv; +}; + +#if (IS_BUILTIN(CONFIG_NET_AIROHA) || IS_MODULE(CONFIG_NET_AIROHA)) +struct airoha_ppe_dev *airoha_ppe_get_dev(struct device *dev); +void airoha_ppe_put_dev(struct airoha_ppe_dev *dev); + +static inline int airoha_ppe_dev_setup_tc_block_cb(struct airoha_ppe_dev *dev, + void *type_data) +{ + return dev->ops.setup_tc_block_cb(dev, type_data); +} +#else +static inline struct airoha_ppe_dev *airoha_ppe_get_dev(struct device *dev) +{ + return NULL; +} + +static inline void airoha_ppe_put_dev(struct airoha_ppe_dev *dev) +{ +} + +static inline int airoha_ppe_setup_tc_block_cb(struct airoha_ppe_dev *dev, + void *type_data) +{ + return -EOPNOTSUPP; +} +#endif + #define NPU_NUM_CORES 8 #define NPU_NUM_IRQ 6 #define NPU_RX0_DESC_NUM 512 -- cgit v1.2.3 From a7cc1aa151e3a9c0314b995f06102f7763d3bd71 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 23 Aug 2025 09:56:04 +0200 Subject: net: airoha: Introduce check_skb callback in ppe_dev ops Export airoha_ppe_check_skb routine in ppe_dev ops. check_skb callback will be used by the MT76 driver in order to offload the traffic received by the wlan NIC and forwarded to the ethernet one. Add rx_wlan parameter to airoha_ppe_check_skb routine signature. 
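A sketch of the intended wlan RX hook (the caller shown here is hypothetical), passing rx_wlan=true so the flow-stats update reserved for wired RX is skipped:

#include <linux/skbuff.h>
#include <linux/soc/airoha/airoha_offload.h>

static void wlan_rx_ppe_check(struct airoha_ppe_dev *ppe_dev,
			      struct sk_buff *skb, u16 ppe_hash)
{
	/* rx_wlan=true: commit the FOE entry without the wired flow-stats update */
	airoha_ppe_dev_check_skb(ppe_dev, skb, ppe_hash, true);
}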
Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250823-airoha-en7581-wlan-rx-offload-v3-3-f78600ec3ed8@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 3 ++- drivers/net/ethernet/airoha/airoha_eth.h | 8 ++------ drivers/net/ethernet/airoha/airoha_ppe.c | 25 ++++++++++++++----------- include/linux/soc/airoha/airoha_offload.h | 20 ++++++++++++++++++++ 4 files changed, 38 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index 5a04f90dd3de..81ea01a652b9 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -698,7 +698,8 @@ static int airoha_qdma_rx_process(struct airoha_queue *q, int budget) reason = FIELD_GET(AIROHA_RXD4_PPE_CPU_REASON, msg1); if (reason == PPE_CPU_REASON_HIT_UNBIND_RATE_REACHED) - airoha_ppe_check_skb(eth->ppe, q->skb, hash); + airoha_ppe_check_skb(ð->ppe->dev, q->skb, hash, + false); done++; napi_gro_receive(&q->napi, q->skb); diff --git a/drivers/net/ethernet/airoha/airoha_eth.h b/drivers/net/ethernet/airoha/airoha_eth.h index 9060b1d2814e..77fd13d466dc 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.h +++ b/drivers/net/ethernet/airoha/airoha_eth.h @@ -229,10 +229,6 @@ struct airoha_hw_stats { u64 rx_len[7]; }; -enum { - PPE_CPU_REASON_HIT_UNBIND_RATE_REACHED = 0x0f, -}; - enum { AIROHA_FOE_STATE_INVALID, AIROHA_FOE_STATE_UNBIND, @@ -622,8 +618,8 @@ static inline bool airhoa_is_lan_gdm_port(struct airoha_gdm_port *port) bool airoha_is_valid_gdm_port(struct airoha_eth *eth, struct airoha_gdm_port *port); -void airoha_ppe_check_skb(struct airoha_ppe *ppe, struct sk_buff *skb, - u16 hash); +void airoha_ppe_check_skb(struct airoha_ppe_dev *dev, struct sk_buff *skb, + u16 hash, bool rx_wlan); int airoha_ppe_setup_tc_block_cb(struct airoha_ppe_dev *dev, void *type_data); int airoha_ppe_init(struct airoha_eth *eth); void airoha_ppe_deinit(struct airoha_eth *eth); diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index 03d9b1f24bb3..78473527ff50 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -616,7 +616,7 @@ static bool airoha_ppe_foe_compare_entry(struct airoha_flow_table_entry *e, static int airoha_ppe_foe_commit_entry(struct airoha_ppe *ppe, struct airoha_foe_entry *e, - u32 hash) + u32 hash, bool rx_wlan) { struct airoha_foe_entry *hwe = ppe->foe + hash * sizeof(*hwe); u32 ts = airoha_ppe_get_timestamp(ppe); @@ -639,7 +639,8 @@ static int airoha_ppe_foe_commit_entry(struct airoha_ppe *ppe, goto unlock; } - airoha_ppe_foe_flow_stats_update(ppe, npu, hwe, hash); + if (!rx_wlan) + airoha_ppe_foe_flow_stats_update(ppe, npu, hwe, hash); if (hash < PPE_SRAM_NUM_ENTRIES) { dma_addr_t addr = ppe->foe_dma + hash * sizeof(*hwe); @@ -665,7 +666,7 @@ static void airoha_ppe_foe_remove_flow(struct airoha_ppe *ppe, e->data.ib1 &= ~AIROHA_FOE_IB1_BIND_STATE; e->data.ib1 |= FIELD_PREP(AIROHA_FOE_IB1_BIND_STATE, AIROHA_FOE_STATE_INVALID); - airoha_ppe_foe_commit_entry(ppe, &e->data, e->hash); + airoha_ppe_foe_commit_entry(ppe, &e->data, e->hash, false); e->hash = 0xffff; } if (e->type == FLOW_TYPE_L2_SUBFLOW) { @@ -704,7 +705,7 @@ static void airoha_ppe_foe_flow_remove_entry(struct airoha_ppe *ppe, static int airoha_ppe_foe_commit_subflow_entry(struct airoha_ppe *ppe, struct airoha_flow_table_entry *e, - u32 hash) + u32 hash, bool rx_wlan) { u32 mask = 
AIROHA_FOE_IB1_BIND_PACKET_TYPE | AIROHA_FOE_IB1_BIND_UDP; struct airoha_foe_entry *hwe_p, hwe; @@ -745,14 +746,14 @@ airoha_ppe_foe_commit_subflow_entry(struct airoha_ppe *ppe, } hwe.bridge.data = e->data.bridge.data; - airoha_ppe_foe_commit_entry(ppe, &hwe, hash); + airoha_ppe_foe_commit_entry(ppe, &hwe, hash, rx_wlan); return 0; } static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe, struct sk_buff *skb, - u32 hash) + u32 hash, bool rx_wlan) { struct airoha_flow_table_entry *e; struct airoha_foe_bridge br = {}; @@ -785,7 +786,7 @@ static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe, if (!airoha_ppe_foe_compare_entry(e, hwe)) continue; - airoha_ppe_foe_commit_entry(ppe, &e->data, hash); + airoha_ppe_foe_commit_entry(ppe, &e->data, hash, rx_wlan); commit_done = true; e->hash = hash; } @@ -797,7 +798,7 @@ static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe, e = rhashtable_lookup_fast(&ppe->l2_flows, &br, airoha_l2_flow_table_params); if (e) - airoha_ppe_foe_commit_subflow_entry(ppe, e, hash); + airoha_ppe_foe_commit_subflow_entry(ppe, e, hash, rx_wlan); unlock: spin_unlock_bh(&ppe_lock); } @@ -1301,9 +1302,10 @@ int airoha_ppe_setup_tc_block_cb(struct airoha_ppe_dev *dev, void *type_data) return err; } -void airoha_ppe_check_skb(struct airoha_ppe *ppe, struct sk_buff *skb, - u16 hash) +void airoha_ppe_check_skb(struct airoha_ppe_dev *dev, struct sk_buff *skb, + u16 hash, bool rx_wlan) { + struct airoha_ppe *ppe = dev->priv; u16 now, diff; if (hash > PPE_HASH_MASK) @@ -1315,7 +1317,7 @@ void airoha_ppe_check_skb(struct airoha_ppe *ppe, struct sk_buff *skb, return; ppe->foe_check_time[hash] = now; - airoha_ppe_foe_insert_entry(ppe, skb, hash); + airoha_ppe_foe_insert_entry(ppe, skb, hash, rx_wlan); } void airoha_ppe_init_upd_mem(struct airoha_gdm_port *port) @@ -1404,6 +1406,7 @@ int airoha_ppe_init(struct airoha_eth *eth) return -ENOMEM; ppe->dev.ops.setup_tc_block_cb = airoha_ppe_setup_tc_block_cb; + ppe->dev.ops.check_skb = airoha_ppe_check_skb; ppe->dev.priv = ppe; foe_size = PPE_NUM_ENTRIES * sizeof(struct airoha_foe_entry); diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h index 4b4b8b9e426d..1dc5b4e35ef9 100644 --- a/include/linux/soc/airoha/airoha_offload.h +++ b/include/linux/soc/airoha/airoha_offload.h @@ -9,10 +9,17 @@ #include #include +enum { + PPE_CPU_REASON_HIT_UNBIND_RATE_REACHED = 0x0f, +}; + struct airoha_ppe_dev { struct { int (*setup_tc_block_cb)(struct airoha_ppe_dev *dev, void *type_data); + void (*check_skb)(struct airoha_ppe_dev *dev, + struct sk_buff *skb, u16 hash, + bool rx_wlan); } ops; void *priv; @@ -27,6 +34,13 @@ static inline int airoha_ppe_dev_setup_tc_block_cb(struct airoha_ppe_dev *dev, { return dev->ops.setup_tc_block_cb(dev, type_data); } + +static inline void airoha_ppe_dev_check_skb(struct airoha_ppe_dev *dev, + struct sk_buff *skb, + u16 hash, bool rx_wlan) +{ + dev->ops.check_skb(dev, skb, hash, rx_wlan); +} #else static inline struct airoha_ppe_dev *airoha_ppe_get_dev(struct device *dev) { @@ -42,6 +56,12 @@ static inline int airoha_ppe_setup_tc_block_cb(struct airoha_ppe_dev *dev, { return -EOPNOTSUPP; } + +static inline void airoha_ppe_dev_check_skb(struct airoha_ppe_dev *dev, + struct sk_buff *skb, u16 hash, + bool rx_wlan) +{ +} #endif #define NPU_NUM_CORES 8 -- cgit v1.2.3 From f1241200cd66b3e25fd2a44dd961d9720e965aa1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Aug 2025 19:07:00 +0000 Subject: tcp: Don't pass hashinfo to inet_diag helpers. 
These inet_diag functions required struct inet_hashinfo because they were shared by TCP and DCCP: * inet_diag_dump_icsk() * inet_diag_dump_one_icsk() * inet_diag_find_one_icsk() DCCP is gone, so we no longer need to pass hashinfo down to them. Let's fetch net->ipv4.tcp_death_row.hashinfo directly in the first 2 functions. Note that inet_diag_find_one_icsk() hasn't needed hashinfo since the previous patch. We will move TCP-specific functions to tcp_diag.c in the next patch. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250822190803.540788-6-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/inet_diag.h | 6 ++---- net/ipv4/inet_diag.c | 10 +++++----- net/ipv4/tcp_diag.c | 17 +++-------------- 3 files changed, 10 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index a9033696b0aa..34de992b5bd9 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -48,15 +48,13 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, u16 nlmsg_flags, bool net_admin); -void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, +void inet_diag_dump_icsk(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r); -int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, - struct netlink_callback *cb, +int inet_diag_dump_one_icsk(struct netlink_callback *cb, const struct inet_diag_req_v2 *req); struct sock *inet_diag_find_one_icsk(struct net *net, - struct inet_hashinfo *hashinfo, const struct inet_diag_req_v2 *req); int inet_diag_bc_sk(const struct nlattr *_bc, struct sock *sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 462406948c84..5cbbb0695aff 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -519,7 +519,6 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, } struct sock *inet_diag_find_one_icsk(struct net *net, - struct inet_hashinfo *hashinfo, const struct inet_diag_req_v2 *req) { struct sock *sk; @@ -562,8 +561,7 @@ struct sock *inet_diag_find_one_icsk(struct net *net, } EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk); -int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, - struct netlink_callback *cb, +int inet_diag_dump_one_icsk(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { struct sk_buff *in_skb = cb->skb; @@ -573,7 +571,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sock *sk; int err; - sk = inet_diag_find_one_icsk(net, hashinfo, req); + sk = inet_diag_find_one_icsk(net, req); if (IS_ERR(sk)) return PTR_ERR(sk); @@ -1018,7 +1016,7 @@ static void twsk_build_assert(void) #endif } -void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, +void inet_diag_dump_icsk(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { @@ -1026,10 +1024,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct inet_diag_dump_data *cb_data = cb->data; struct net *net = sock_net(skb->sk); u32 idiag_states = r->idiag_states; + struct inet_hashinfo *hashinfo; int i, num, s_i, s_num; struct nlattr *bc; struct sock *sk; + hashinfo = net->ipv4.tcp_death_row.hashinfo; bc = cb_data->inet_diag_nla_bc; if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 45e174b8cd22..7cd9d032efdd 100644 
--- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -180,21 +180,13 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { - struct inet_hashinfo *hinfo; - - hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo; - - inet_diag_dump_icsk(hinfo, skb, cb, r); + inet_diag_dump_icsk(skb, cb, r); } static int tcp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - struct inet_hashinfo *hinfo; - - hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo; - - return inet_diag_dump_one_icsk(hinfo, cb, req); + return inet_diag_dump_one_icsk(cb, req); } #ifdef CONFIG_INET_DIAG_DESTROY @@ -202,13 +194,10 @@ static int tcp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { struct net *net = sock_net(in_skb->sk); - struct inet_hashinfo *hinfo; struct sock *sk; int err; - hinfo = net->ipv4.tcp_death_row.hashinfo; - sk = inet_diag_find_one_icsk(net, hinfo, req); - + sk = inet_diag_find_one_icsk(net, req); if (IS_ERR(sk)) return PTR_ERR(sk); -- cgit v1.2.3 From 382a4d9cb6dc07643345e15c49738088a727d29b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Aug 2025 19:07:01 +0000 Subject: tcp: Move TCP-specific diag functions to tcp_diag.c. tcp_diag_dump() / tcp_diag_dump_one() are just wrappers of inet_diag_dump_icsk() / inet_diag_dump_one_icsk(), respectively. Let's inline them in tcp_diag.c and move static callees as well. Note that inet_sk_attr_size() is merged into tcp_diag_get_aux_size(), and we remove inet_diag_handler.idiag_get_aux_size() accordingly. While at it, BUG_ON() is replaced with DEBUG_NET_WARN_ON_ONCE(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250822190803.540788-7-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/inet_diag.h | 11 -- net/ipv4/inet_diag.c | 479 ---------------------------------------------- net/ipv4/tcp_diag.c | 460 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 455 insertions(+), 495 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 34de992b5bd9..30bf8f7ea62b 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -24,9 +24,6 @@ struct inet_diag_handler { bool net_admin, struct sk_buff *skb); - size_t (*idiag_get_aux_size)(struct sock *sk, - bool net_admin); - int (*destroy)(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req); @@ -48,14 +45,6 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, u16 nlmsg_flags, bool net_admin); -void inet_diag_dump_icsk(struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r); -int inet_diag_dump_one_icsk(struct netlink_callback *cb, - const struct inet_diag_req_v2 *req); - -struct sock *inet_diag_find_one_icsk(struct net *net, - const struct inet_diag_req_v2 *req); int inet_diag_bc_sk(const struct nlattr *_bc, struct sock *sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5cbbb0695aff..9d4dcd17728c 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -20,9 +20,6 @@ #include #include #include -#include -#include -#include #include #include @@ -97,31 +94,6 @@ void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk) } EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill); -static size_t inet_sk_attr_size(struct sock *sk, - 
const struct inet_diag_req_v2 *req, - bool net_admin) -{ - const struct inet_diag_handler *handler; - size_t aux = 0; - - rcu_read_lock(); - handler = rcu_dereference(inet_diag_table[req->sdiag_protocol]); - DEBUG_NET_WARN_ON_ONCE(!handler); - if (handler && handler->idiag_get_aux_size) - aux = handler->idiag_get_aux_size(sk, net_admin); - rcu_read_unlock(); - - return nla_total_size(sizeof(struct tcp_info)) - + nla_total_size(sizeof(struct inet_diag_msg)) - + inet_diag_msg_attrs_size() - + nla_total_size(sizeof(struct inet_diag_meminfo)) - + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) - + nla_total_size(TCP_CA_NAME_MAX) - + nla_total_size(sizeof(struct tcpvegas_info)) - + aux - + 64; -} - int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, struct user_namespace *user_ns, @@ -422,181 +394,6 @@ errout: } EXPORT_SYMBOL_GPL(inet_sk_diag_fill); -static int inet_twsk_diag_fill(struct sock *sk, - struct sk_buff *skb, - struct netlink_callback *cb, - u16 nlmsg_flags, bool net_admin) -{ - struct inet_timewait_sock *tw = inet_twsk(sk); - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - long tmo; - - nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, - sizeof(*r), nlmsg_flags); - if (!nlh) - return -EMSGSIZE; - - r = nlmsg_data(nlh); - BUG_ON(tw->tw_state != TCP_TIME_WAIT); - - inet_diag_msg_common_fill(r, sk); - r->idiag_retrans = 0; - - r->idiag_state = READ_ONCE(tw->tw_substate); - r->idiag_timer = 3; - tmo = tw->tw_timer.expires - jiffies; - r->idiag_expires = jiffies_delta_to_msecs(tmo); - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = 0; - r->idiag_inode = 0; - - if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, - tw->tw_mark)) { - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; - } - - nlmsg_end(skb, nlh); - return 0; -} - -static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, - struct netlink_callback *cb, - u16 nlmsg_flags, bool net_admin) -{ - struct request_sock *reqsk = inet_reqsk(sk); - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - long tmo; - - nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); - if (!nlh) - return -EMSGSIZE; - - r = nlmsg_data(nlh); - inet_diag_msg_common_fill(r, sk); - r->idiag_state = TCP_SYN_RECV; - r->idiag_timer = 1; - r->idiag_retrans = reqsk->num_retrans; - - BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != - offsetof(struct sock, sk_cookie)); - - tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies; - r->idiag_expires = jiffies_delta_to_msecs(tmo); - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = 0; - r->idiag_inode = 0; - - if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, - inet_rsk(reqsk)->ir_mark)) { - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; - } - - nlmsg_end(skb, nlh); - return 0; -} - -static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - u16 nlmsg_flags, bool net_admin) -{ - if (sk->sk_state == TCP_TIME_WAIT) - return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); - - if (sk->sk_state == TCP_NEW_SYN_RECV) - return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); - - return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags, - net_admin); -} - -struct sock *inet_diag_find_one_icsk(struct net *net, - const struct inet_diag_req_v2 *req) -{ - struct sock *sk; - - rcu_read_lock(); - if (req->sdiag_family == AF_INET) - sk = inet_lookup(net, 
NULL, 0, req->id.idiag_dst[0], - req->id.idiag_dport, req->id.idiag_src[0], - req->id.idiag_sport, req->id.idiag_if); -#if IS_ENABLED(CONFIG_IPV6) - else if (req->sdiag_family == AF_INET6) { - if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && - ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) - sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[3], - req->id.idiag_dport, req->id.idiag_src[3], - req->id.idiag_sport, req->id.idiag_if); - else - sk = inet6_lookup(net, NULL, 0, - (struct in6_addr *)req->id.idiag_dst, - req->id.idiag_dport, - (struct in6_addr *)req->id.idiag_src, - req->id.idiag_sport, - req->id.idiag_if); - } -#endif - else { - rcu_read_unlock(); - return ERR_PTR(-EINVAL); - } - rcu_read_unlock(); - if (!sk) - return ERR_PTR(-ENOENT); - - if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { - sock_gen_put(sk); - return ERR_PTR(-ENOENT); - } - - return sk; -} -EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk); - -int inet_diag_dump_one_icsk(struct netlink_callback *cb, - const struct inet_diag_req_v2 *req) -{ - struct sk_buff *in_skb = cb->skb; - bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); - struct net *net = sock_net(in_skb->sk); - struct sk_buff *rep; - struct sock *sk; - int err; - - sk = inet_diag_find_one_icsk(net, req); - if (IS_ERR(sk)) - return PTR_ERR(sk); - - rep = nlmsg_new(inet_sk_attr_size(sk, req, net_admin), GFP_KERNEL); - if (!rep) { - err = -ENOMEM; - goto out; - } - - err = sk_diag_fill(sk, rep, cb, req, 0, net_admin); - if (err < 0) { - WARN_ON(err == -EMSGSIZE); - nlmsg_free(rep); - goto out; - } - err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); - -out: - if (sk) - sock_gen_put(sk); - - return err; -} -EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); - static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, const struct nlmsghdr *nlh, int hdrlen, @@ -990,282 +787,6 @@ static int inet_diag_bc_audit(const struct nlattr *attr, return len == 0 ? 
0 : -EINVAL; } -static void twsk_build_assert(void) -{ - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != - offsetof(struct sock, sk_family)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) != - offsetof(struct inet_sock, inet_num)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) != - offsetof(struct inet_sock, inet_dport)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) != - offsetof(struct inet_sock, inet_rcv_saddr)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) != - offsetof(struct inet_sock, inet_daddr)); - -#if IS_ENABLED(CONFIG_IPV6) - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) != - offsetof(struct sock, sk_v6_rcv_saddr)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) != - offsetof(struct sock, sk_v6_daddr)); -#endif -} - -void inet_diag_dump_icsk(struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r) -{ - bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); - struct inet_diag_dump_data *cb_data = cb->data; - struct net *net = sock_net(skb->sk); - u32 idiag_states = r->idiag_states; - struct inet_hashinfo *hashinfo; - int i, num, s_i, s_num; - struct nlattr *bc; - struct sock *sk; - - hashinfo = net->ipv4.tcp_death_row.hashinfo; - bc = cb_data->inet_diag_nla_bc; - if (idiag_states & TCPF_SYN_RECV) - idiag_states |= TCPF_NEW_SYN_RECV; - s_i = cb->args[1]; - s_num = num = cb->args[2]; - - if (cb->args[0] == 0) { - if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport) - goto skip_listen_ht; - - for (i = s_i; i <= hashinfo->lhash2_mask; i++) { - struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; - - num = 0; - ilb = &hashinfo->lhash2[i]; - - if (hlist_nulls_empty(&ilb->nulls_head)) { - s_num = 0; - continue; - } - spin_lock(&ilb->lock); - sk_nulls_for_each(sk, node, &ilb->nulls_head) { - struct inet_sock *inet = inet_sk(sk); - - if (!net_eq(sock_net(sk), net)) - continue; - - if (num < s_num) { - num++; - continue; - } - - if (r->sdiag_family != AF_UNSPEC && - sk->sk_family != r->sdiag_family) - goto next_listen; - - if (r->id.idiag_sport != inet->inet_sport && - r->id.idiag_sport) - goto next_listen; - - if (!inet_diag_bc_sk(bc, sk)) - goto next_listen; - - if (inet_sk_diag_fill(sk, inet_csk(sk), skb, - cb, r, NLM_F_MULTI, - net_admin) < 0) { - spin_unlock(&ilb->lock); - goto done; - } - -next_listen: - ++num; - } - spin_unlock(&ilb->lock); - - s_num = 0; - } -skip_listen_ht: - cb->args[0] = 1; - s_i = num = s_num = 0; - } - -/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets - * with bh disabled. - */ -#define SKARR_SZ 16 - - /* Dump bound but inactive (not listening, connecting, etc.) 
sockets */ - if (cb->args[0] == 1) { - if (!(idiag_states & TCPF_BOUND_INACTIVE)) - goto skip_bind_ht; - - for (i = s_i; i < hashinfo->bhash_size; i++) { - struct inet_bind_hashbucket *ibb; - struct inet_bind2_bucket *tb2; - struct sock *sk_arr[SKARR_SZ]; - int num_arr[SKARR_SZ]; - int idx, accum, res; - -resume_bind_walk: - num = 0; - accum = 0; - ibb = &hashinfo->bhash2[i]; - - if (hlist_empty(&ibb->chain)) { - s_num = 0; - continue; - } - spin_lock_bh(&ibb->lock); - inet_bind_bucket_for_each(tb2, &ibb->chain) { - if (!net_eq(ib2_net(tb2), net)) - continue; - - sk_for_each_bound(sk, &tb2->owners) { - struct inet_sock *inet = inet_sk(sk); - - if (num < s_num) - goto next_bind; - - if (sk->sk_state != TCP_CLOSE || - !inet->inet_num) - goto next_bind; - - if (r->sdiag_family != AF_UNSPEC && - r->sdiag_family != sk->sk_family) - goto next_bind; - - if (!inet_diag_bc_sk(bc, sk)) - goto next_bind; - - sock_hold(sk); - num_arr[accum] = num; - sk_arr[accum] = sk; - if (++accum == SKARR_SZ) - goto pause_bind_walk; -next_bind: - num++; - } - } -pause_bind_walk: - spin_unlock_bh(&ibb->lock); - - res = 0; - for (idx = 0; idx < accum; idx++) { - if (res >= 0) { - res = inet_sk_diag_fill(sk_arr[idx], - NULL, skb, cb, - r, NLM_F_MULTI, - net_admin); - if (res < 0) - num = num_arr[idx]; - } - sock_put(sk_arr[idx]); - } - if (res < 0) - goto done; - - cond_resched(); - - if (accum == SKARR_SZ) { - s_num = num + 1; - goto resume_bind_walk; - } - - s_num = 0; - } -skip_bind_ht: - cb->args[0] = 2; - s_i = num = s_num = 0; - } - - if (!(idiag_states & ~TCPF_LISTEN)) - goto out; - - for (i = s_i; i <= hashinfo->ehash_mask; i++) { - struct inet_ehash_bucket *head = &hashinfo->ehash[i]; - spinlock_t *lock = inet_ehash_lockp(hashinfo, i); - struct hlist_nulls_node *node; - struct sock *sk_arr[SKARR_SZ]; - int num_arr[SKARR_SZ]; - int idx, accum, res; - - if (hlist_nulls_empty(&head->chain)) - continue; - - if (i > s_i) - s_num = 0; - -next_chunk: - num = 0; - accum = 0; - spin_lock_bh(lock); - sk_nulls_for_each(sk, node, &head->chain) { - int state; - - if (!net_eq(sock_net(sk), net)) - continue; - if (num < s_num) - goto next_normal; - state = (sk->sk_state == TCP_TIME_WAIT) ? 
- READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state; - if (!(idiag_states & (1 << state))) - goto next_normal; - if (r->sdiag_family != AF_UNSPEC && - sk->sk_family != r->sdiag_family) - goto next_normal; - if (r->id.idiag_sport != htons(sk->sk_num) && - r->id.idiag_sport) - goto next_normal; - if (r->id.idiag_dport != sk->sk_dport && - r->id.idiag_dport) - goto next_normal; - twsk_build_assert(); - - if (!inet_diag_bc_sk(bc, sk)) - goto next_normal; - - if (!refcount_inc_not_zero(&sk->sk_refcnt)) - goto next_normal; - - num_arr[accum] = num; - sk_arr[accum] = sk; - if (++accum == SKARR_SZ) - break; -next_normal: - ++num; - } - spin_unlock_bh(lock); - res = 0; - for (idx = 0; idx < accum; idx++) { - if (res >= 0) { - res = sk_diag_fill(sk_arr[idx], skb, cb, r, - NLM_F_MULTI, net_admin); - if (res < 0) - num = num_arr[idx]; - } - sock_gen_put(sk_arr[idx]); - } - if (res < 0) - break; - cond_resched(); - if (accum == SKARR_SZ) { - s_num = num + 1; - goto next_chunk; - } - } - -done: - cb->args[1] = i; - cb->args[2] = num; -out: - ; -} -EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); - static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 7cd9d032efdd..2f3a779ce7a2 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -12,6 +12,9 @@ #include +#include +#include +#include #include #include @@ -174,19 +177,467 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) size += ulp_ops->get_info_size(sk, net_admin); } } - return size; + + return size + + nla_total_size(sizeof(struct tcp_info)) + + nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct inet_diag_meminfo)) + + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) + + nla_total_size(TCP_CA_NAME_MAX) + + nla_total_size(sizeof(struct tcpvegas_info)) + + 64; +} + +static int tcp_twsk_diag_fill(struct sock *sk, + struct sk_buff *skb, + struct netlink_callback *cb, + u16 nlmsg_flags, bool net_admin) +{ + struct inet_timewait_sock *tw = inet_twsk(sk); + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, + sizeof(*r), nlmsg_flags); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + DEBUG_NET_WARN_ON_ONCE(tw->tw_state != TCP_TIME_WAIT); + + inet_diag_msg_common_fill(r, sk); + r->idiag_retrans = 0; + + r->idiag_state = READ_ONCE(tw->tw_substate); + r->idiag_timer = 3; + tmo = tw->tw_timer.expires - jiffies; + r->idiag_expires = jiffies_delta_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; + + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + tw->tw_mark)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + nlmsg_end(skb, nlh); + return 0; +} + +static int tcp_req_diag_fill(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + u16 nlmsg_flags, bool net_admin) +{ + struct request_sock *reqsk = inet_reqsk(sk); + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + inet_diag_msg_common_fill(r, sk); + r->idiag_state = TCP_SYN_RECV; + r->idiag_timer = 1; + r->idiag_retrans = reqsk->num_retrans; + + BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != + offsetof(struct sock, 
sk_cookie)); + + tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies; + r->idiag_expires = jiffies_delta_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; + + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + inet_rsk(reqsk)->ir_mark)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + nlmsg_end(skb, nlh); + return 0; +} + +static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, + u16 nlmsg_flags, bool net_admin) +{ + if (sk->sk_state == TCP_TIME_WAIT) + return tcp_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); + + if (sk->sk_state == TCP_NEW_SYN_RECV) + return tcp_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); + + return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags, + net_admin); +} + +static void twsk_build_assert(void) +{ + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != + offsetof(struct sock, sk_family)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) != + offsetof(struct inet_sock, inet_num)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) != + offsetof(struct inet_sock, inet_dport)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) != + offsetof(struct inet_sock, inet_rcv_saddr)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) != + offsetof(struct inet_sock, inet_daddr)); + +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) != + offsetof(struct sock, sk_v6_rcv_saddr)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) != + offsetof(struct sock, sk_v6_daddr)); +#endif } static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { - inet_diag_dump_icsk(skb, cb, r); + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct inet_diag_dump_data *cb_data = cb->data; + struct net *net = sock_net(skb->sk); + u32 idiag_states = r->idiag_states; + struct inet_hashinfo *hashinfo; + int i, num, s_i, s_num; + struct nlattr *bc; + struct sock *sk; + + hashinfo = net->ipv4.tcp_death_row.hashinfo; + bc = cb_data->inet_diag_nla_bc; + if (idiag_states & TCPF_SYN_RECV) + idiag_states |= TCPF_NEW_SYN_RECV; + s_i = cb->args[1]; + s_num = num = cb->args[2]; + + if (cb->args[0] == 0) { + if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport) + goto skip_listen_ht; + + for (i = s_i; i <= hashinfo->lhash2_mask; i++) { + struct inet_listen_hashbucket *ilb; + struct hlist_nulls_node *node; + + num = 0; + ilb = &hashinfo->lhash2[i]; + + if (hlist_nulls_empty(&ilb->nulls_head)) { + s_num = 0; + continue; + } + spin_lock(&ilb->lock); + sk_nulls_for_each(sk, node, &ilb->nulls_head) { + struct inet_sock *inet = inet_sk(sk); + + if (!net_eq(sock_net(sk), net)) + continue; + + if (num < s_num) { + num++; + continue; + } + + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next_listen; + + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next_listen; + + if (!inet_diag_bc_sk(bc, sk)) + goto next_listen; + + if (inet_sk_diag_fill(sk, inet_csk(sk), skb, + cb, r, NLM_F_MULTI, + net_admin) < 0) { + spin_unlock(&ilb->lock); + goto done; + } + +next_listen: + ++num; + } + spin_unlock(&ilb->lock); + + s_num = 0; + } +skip_listen_ht: + cb->args[0] = 1; + s_i = num = s_num = 0; + } + +/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets + * with bh disabled. 
+ */ +#define SKARR_SZ 16 + + /* Dump bound but inactive (not listening, connecting, etc.) sockets */ + if (cb->args[0] == 1) { + if (!(idiag_states & TCPF_BOUND_INACTIVE)) + goto skip_bind_ht; + + for (i = s_i; i < hashinfo->bhash_size; i++) { + struct inet_bind_hashbucket *ibb; + struct inet_bind2_bucket *tb2; + struct sock *sk_arr[SKARR_SZ]; + int num_arr[SKARR_SZ]; + int idx, accum, res; + +resume_bind_walk: + num = 0; + accum = 0; + ibb = &hashinfo->bhash2[i]; + + if (hlist_empty(&ibb->chain)) { + s_num = 0; + continue; + } + spin_lock_bh(&ibb->lock); + inet_bind_bucket_for_each(tb2, &ibb->chain) { + if (!net_eq(ib2_net(tb2), net)) + continue; + + sk_for_each_bound(sk, &tb2->owners) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_bind; + + if (sk->sk_state != TCP_CLOSE || + !inet->inet_num) + goto next_bind; + + if (r->sdiag_family != AF_UNSPEC && + r->sdiag_family != sk->sk_family) + goto next_bind; + + if (!inet_diag_bc_sk(bc, sk)) + goto next_bind; + + sock_hold(sk); + num_arr[accum] = num; + sk_arr[accum] = sk; + if (++accum == SKARR_SZ) + goto pause_bind_walk; +next_bind: + num++; + } + } +pause_bind_walk: + spin_unlock_bh(&ibb->lock); + + res = 0; + for (idx = 0; idx < accum; idx++) { + if (res >= 0) { + res = inet_sk_diag_fill(sk_arr[idx], + NULL, skb, cb, + r, NLM_F_MULTI, + net_admin); + if (res < 0) + num = num_arr[idx]; + } + sock_put(sk_arr[idx]); + } + if (res < 0) + goto done; + + cond_resched(); + + if (accum == SKARR_SZ) { + s_num = num + 1; + goto resume_bind_walk; + } + + s_num = 0; + } +skip_bind_ht: + cb->args[0] = 2; + s_i = num = s_num = 0; + } + + if (!(idiag_states & ~TCPF_LISTEN)) + goto out; + + for (i = s_i; i <= hashinfo->ehash_mask; i++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[i]; + spinlock_t *lock = inet_ehash_lockp(hashinfo, i); + struct hlist_nulls_node *node; + struct sock *sk_arr[SKARR_SZ]; + int num_arr[SKARR_SZ]; + int idx, accum, res; + + if (hlist_nulls_empty(&head->chain)) + continue; + + if (i > s_i) + s_num = 0; + +next_chunk: + num = 0; + accum = 0; + spin_lock_bh(lock); + sk_nulls_for_each(sk, node, &head->chain) { + int state; + + if (!net_eq(sock_net(sk), net)) + continue; + if (num < s_num) + goto next_normal; + state = (sk->sk_state == TCP_TIME_WAIT) ? 
+ READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state; + if (!(idiag_states & (1 << state))) + goto next_normal; + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next_normal; + if (r->id.idiag_sport != htons(sk->sk_num) && + r->id.idiag_sport) + goto next_normal; + if (r->id.idiag_dport != sk->sk_dport && + r->id.idiag_dport) + goto next_normal; + twsk_build_assert(); + + if (!inet_diag_bc_sk(bc, sk)) + goto next_normal; + + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + goto next_normal; + + num_arr[accum] = num; + sk_arr[accum] = sk; + if (++accum == SKARR_SZ) + break; +next_normal: + ++num; + } + spin_unlock_bh(lock); + + res = 0; + for (idx = 0; idx < accum; idx++) { + if (res >= 0) { + res = sk_diag_fill(sk_arr[idx], skb, cb, r, + NLM_F_MULTI, net_admin); + if (res < 0) + num = num_arr[idx]; + } + sock_gen_put(sk_arr[idx]); + } + if (res < 0) + break; + + cond_resched(); + + if (accum == SKARR_SZ) { + s_num = num + 1; + goto next_chunk; + } + } + +done: + cb->args[1] = i; + cb->args[2] = num; +out: + ; +} + +static struct sock *tcp_diag_find_one_icsk(struct net *net, + const struct inet_diag_req_v2 *req) +{ + struct sock *sk; + + rcu_read_lock(); + if (req->sdiag_family == AF_INET) { + sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[0], + req->id.idiag_dport, req->id.idiag_src[0], + req->id.idiag_sport, req->id.idiag_if); +#if IS_ENABLED(CONFIG_IPV6) + } else if (req->sdiag_family == AF_INET6) { + if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && + ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) + sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[3], + req->id.idiag_dport, req->id.idiag_src[3], + req->id.idiag_sport, req->id.idiag_if); + else + sk = inet6_lookup(net, NULL, 0, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if); +#endif + } else { + rcu_read_unlock(); + return ERR_PTR(-EINVAL); + } + rcu_read_unlock(); + if (!sk) + return ERR_PTR(-ENOENT); + + if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { + sock_gen_put(sk); + return ERR_PTR(-ENOENT); + } + + return sk; } static int tcp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - return inet_diag_dump_one_icsk(cb, req); + struct sk_buff *in_skb = cb->skb; + struct sk_buff *rep; + struct sock *sk; + struct net *net; + bool net_admin; + int err; + + net = sock_net(in_skb->sk); + sk = tcp_diag_find_one_icsk(net, req); + if (IS_ERR(sk)) + return PTR_ERR(sk); + + net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); + rep = nlmsg_new(tcp_diag_get_aux_size(sk, net_admin), GFP_KERNEL); + if (!rep) { + err = -ENOMEM; + goto out; + } + + err = sk_diag_fill(sk, rep, cb, req, 0, net_admin); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + nlmsg_free(rep); + goto out; + } + err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); + +out: + if (sk) + sock_gen_put(sk); + + return err; } #ifdef CONFIG_INET_DIAG_DESTROY @@ -197,7 +648,7 @@ static int tcp_diag_destroy(struct sk_buff *in_skb, struct sock *sk; int err; - sk = inet_diag_find_one_icsk(net, req); + sk = tcp_diag_find_one_icsk(net, req); if (IS_ERR(sk)) return PTR_ERR(sk); @@ -215,7 +666,6 @@ static const struct inet_diag_handler tcp_diag_handler = { .dump_one = tcp_diag_dump_one, .idiag_get_info = tcp_diag_get_info, .idiag_get_aux = tcp_diag_get_aux, - .idiag_get_aux_size = tcp_diag_get_aux_size, .idiag_type = IPPROTO_TCP, .idiag_info_size = sizeof(struct tcp_info), #ifdef 
CONFIG_INET_DIAG_DESTROY -- cgit v1.2.3 From df534e757321ae6efe848a6a787098c22a390ac6 Mon Sep 17 00:00:00 2001 From: David Yang Date: Sun, 24 Aug 2025 09:30:03 +0800 Subject: net: phylink: remove stale an_enabled from doc state->an_enabled was removed by commit 4ee9b0dcf09f ("net: phylink: remove an_enabled") but was left in the mac_config() documentation, so clean it up. Signed-off-by: David Yang Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/20250824013009.2443580-1-mmyangfl@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 30659b615fca..9af0411761d7 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -320,9 +320,8 @@ int mac_prepare(struct phylink_config *config, unsigned int mode, * If in 802.3z mode, the link speed is fixed, dependent on the * @state->interface. Duplex and pause modes are negotiated via * the in-band configuration word. Advertised pause modes are set - * according to the @state->an_enabled and @state->advertising - * flags. Beware of MACs which only support full duplex at gigabit - * and higher speeds. + * according to @state->advertising. Beware of MACs which only + * support full duplex at gigabit and higher speeds. * * If in Cisco SGMII mode, the link speed and duplex mode are passed * in the serial bitstream 16-bit configuration word, and the MAC @@ -331,7 +330,7 @@ int mac_prepare(struct phylink_config *config, unsigned int mode, * responsible for reading the configuration word and configuring * itself accordingly. * - * Valid state members: interface, an_enabled, pause, advertising. + * Valid state members: interface, pause, advertising. * * Implementations are expected to update the MAC to reflect the * requested settings - i.o.w., if nothing has changed between two -- cgit v1.2.3 From 1b93c03fb319d72a1f5f4723abd5df15ce40f4e2 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 21 Aug 2025 17:06:03 +0800 Subject: rcu: add rcu_read_lock_dont_migrate() migrate_disable() is called to disable migration in the kernel, and it is often used together with rcu_read_lock(). However, with PREEMPT_RCU disabled, it is unnecessary, as rcu_read_lock() will always disable preemption, which also disables migration. Introduce rcu_read_lock_dont_migrate() and rcu_read_unlock_migrate(), which perform the migration disable and enable only when PREEMPT_RCU is enabled. Signed-off-by: Menglong Dong Reviewed-by: Paul E. McKenney Link: https://lore.kernel.org/r/20250821090609.42508-2-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/rcupdate.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 120536f4c6eb..9691ca380a4f 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -962,6 +962,20 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) preempt_enable_notrace(); } +static __always_inline void rcu_read_lock_dont_migrate(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + migrate_disable(); + rcu_read_lock(); +} + +static inline void rcu_read_unlock_migrate(void) +{ + rcu_read_unlock(); + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + migrate_enable(); +} + /** * RCU_INIT_POINTER() - initialize an RCU protected pointer * @p: The pointer to be initialized. 
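A minimal usage sketch of the new helpers (illustrative only: struct foo, foo_ptr, and the last_cpu field are assumptions, not part of the patch):

	struct foo *f;

	rcu_read_lock_dont_migrate();	/* migrate_disable() is compiled in only for PREEMPT_RCU=y */
	f = rcu_dereference(foo_ptr);	/* foo_ptr: a hypothetical RCU-protected pointer */
	if (f)
		f->last_cpu = smp_processor_id();	/* stable: the task cannot migrate here */
	rcu_read_unlock_migrate();	/* migrate_enable() is compiled in only for PREEMPT_RCU=y */

With PREEMPT_RCU disabled, rcu_read_lock() already disables preemption and thus migration, so the helpers simply skip the redundant migrate_disable()/migrate_enable() pair in that configuration.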
-- cgit v1.2.3 From 05db35963eef7a55f1782190185cb8ddb9d923b7 Mon Sep 17 00:00:00 2001 From: Krishna Chaitanya Chundru Date: Wed, 20 Aug 2025 13:58:47 +0530 Subject: OPP: Add support to find OPP for a set of keys Some clients, such as PCIe, may operate at the same clock frequency across different data rates by varying link width. In such cases, frequency alone is not sufficient to uniquely identify an OPP. To support these scenarios, introduce a new API, dev_pm_opp_find_key_exact(), that allows OPP lookup with a set of keys: freq, level, and bandwidth. Signed-off-by: Krishna Chaitanya Chundru [ Viresh: Minor cleanups ] Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pm_opp.h | 30 +++++++++++++++ 2 files changed, 129 insertions(+) (limited to 'include/linux') diff --git a/drivers/opp/core.c b/drivers/opp/core.c index edbd60501cf0..bba4f7daff8c 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -476,6 +476,16 @@ static unsigned long _read_bw(struct dev_pm_opp *opp, int index) return opp->bandwidth[index].peak; } +static unsigned long _read_opp_key(struct dev_pm_opp *opp, int index, + struct dev_pm_opp_key *key) +{ + key->bw = opp->bandwidth ? opp->bandwidth[index].peak : 0; + key->freq = opp->rates[index]; + key->level = opp->level; + + return true; +} + /* Generic comparison helpers */ static bool _compare_exact(struct dev_pm_opp **opp, struct dev_pm_opp *temp_opp, unsigned long opp_key, unsigned long key) @@ -509,6 +519,22 @@ static bool _compare_floor(struct dev_pm_opp **opp, struct dev_pm_opp *temp_opp, return false; } +static bool _compare_opp_key_exact(struct dev_pm_opp **opp, + struct dev_pm_opp *temp_opp, struct dev_pm_opp_key *opp_key, + struct dev_pm_opp_key *key) +{ + bool level_match = (key->level == OPP_LEVEL_UNSET || opp_key->level == key->level); + bool freq_match = (key->freq == 0 || opp_key->freq == key->freq); + bool bw_match = (key->bw == 0 || opp_key->bw == key->bw); + + if (freq_match && level_match && bw_match) { + *opp = temp_opp; + return true; + } + + return false; +} + /* Generic key finding helpers */ static struct dev_pm_opp *_opp_table_find_key(struct opp_table *opp_table, unsigned long *key, int index, bool available, @@ -541,6 +567,37 @@ static struct dev_pm_opp *_opp_table_find_key(struct opp_table *opp_table, return opp; } +static struct dev_pm_opp *_opp_table_find_opp_key(struct opp_table *opp_table, + struct dev_pm_opp_key *key, bool available, + unsigned long (*read)(struct dev_pm_opp *opp, int index, + struct dev_pm_opp_key *key), + bool (*compare)(struct dev_pm_opp **opp, struct dev_pm_opp *temp_opp, + struct dev_pm_opp_key *opp_key, struct dev_pm_opp_key *key), + bool (*assert)(struct opp_table *opp_table, unsigned int index)) +{ + struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE); + struct dev_pm_opp_key temp_key; + + /* Assert that the requirement is met */ + if (!assert(opp_table, 0)) + return ERR_PTR(-EINVAL); + + guard(mutex)(&opp_table->lock); + + list_for_each_entry(temp_opp, &opp_table->opp_list, node) { + if (temp_opp->available == available) { + read(temp_opp, 0, &temp_key); + if (compare(&opp, temp_opp, &temp_key, key)) { + /* Increment the reference count of OPP */ + dev_pm_opp_get(opp); + break; + } + } + } + + return opp; +} + static struct dev_pm_opp * _find_key(struct device *dev, unsigned long *key, int index, bool available, unsigned long (*read)(struct dev_pm_opp *opp, int index), @@ -632,6 +689,48 @@ struct dev_pm_opp 
*dev_pm_opp_find_freq_exact(struct device *dev, } EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_exact); +/** + * dev_pm_opp_find_key_exact() - Search for an OPP with exact key set + * @dev: Device for which the OPP is being searched + * @key: OPP key set to match + * @available: true/false - match for available OPP + * + * Search for an exact match of the key set in the OPP table. + * + * Return: A matching opp on success, else ERR_PTR in case of error. + * Possible error values: + * EINVAL: for bad pointers + * ERANGE: no match found for search + * ENODEV: if device not found in list of registered devices + * + * Note: 'available' is a modifier for the search. If 'available' == true, + * then the match is for exact matching key and is available in the stored + * OPP table. If false, the match is for exact key which is not available. + * + * This provides a mechanism to enable an OPP which is not available currently + * or the opposite as well. + * + * The callers are required to call dev_pm_opp_put() for the returned OPP after + * use. + */ +struct dev_pm_opp *dev_pm_opp_find_key_exact(struct device *dev, + struct dev_pm_opp_key *key, + bool available) +{ + struct opp_table *opp_table __free(put_opp_table) = _find_opp_table(dev); + + if (IS_ERR(opp_table)) { + dev_err(dev, "%s: OPP table not found (%ld)\n", __func__, + PTR_ERR(opp_table)); + return ERR_CAST(opp_table); + } + + return _opp_table_find_opp_key(opp_table, key, available, + _read_opp_key, _compare_opp_key_exact, + assert_single_clk); +} +EXPORT_SYMBOL_GPL(dev_pm_opp_find_key_exact); + /** * dev_pm_opp_find_freq_exact_indexed() - Search for an exact freq for the * clock corresponding to the index diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index cf477beae4bb..789406d95e69 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -98,6 +98,25 @@ struct dev_pm_opp_data { unsigned long u_volt; }; +/** + * struct dev_pm_opp_key - Key used to identify OPP entries + * @freq: Frequency in Hz. Use 0 if frequency is not to be matched. + * @level: Performance level associated with the OPP entry. + * Use OPP_LEVEL_UNSET if level is not to be matched. + * @bw: Bandwidth associated with the OPP entry. + * Use 0 if bandwidth is not to be matched. + * + * This structure is used to uniquely identify an OPP entry based on + * frequency, performance level, and bandwidth. Each field can be + * selectively ignored during matching by setting it to its respective + * NOP value. 
+ */ +struct dev_pm_opp_key { + unsigned long freq; + unsigned int level; + u32 bw; +}; + #if defined(CONFIG_PM_OPP) struct opp_table *dev_pm_opp_get_opp_table(struct device *dev); @@ -131,6 +150,10 @@ struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, unsigned long freq, bool available); +struct dev_pm_opp *dev_pm_opp_find_key_exact(struct device *dev, + struct dev_pm_opp_key *key, + bool available); + struct dev_pm_opp * dev_pm_opp_find_freq_exact_indexed(struct device *dev, unsigned long freq, u32 index, bool available); @@ -289,6 +312,13 @@ static inline struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev, return ERR_PTR(-EOPNOTSUPP); } +static inline struct dev_pm_opp *dev_pm_opp_find_key_exact(struct device *dev, + struct dev_pm_opp_key *key, + bool available) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline struct dev_pm_opp * dev_pm_opp_find_freq_exact_indexed(struct device *dev, unsigned long freq, u32 index, bool available) -- cgit v1.2.3 From e649bcda25b5ae1a30a182cc450f928a0b282c93 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 20 Aug 2025 14:03:39 -0400 Subject: perf: Remove get_perf_callchain() init_nr argument The 'init_nr' argument has double duty: it's used to initialize both the number of contexts and the number of stack entries. That's confusing and the callers always pass zero anyway. Hard code the zero. Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (Google) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20250820180428.259565081@kernel.org --- include/linux/perf_event.h | 2 +- kernel/bpf/stackmap.c | 4 ++-- kernel/events/callchain.c | 12 ++++++------ kernel/events/core.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index bfbf9ea53f25..fd1d91017b99 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1719,7 +1719,7 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * -get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, +get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 3615c06b7dfa..ec3a57a5fba1 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -314,7 +314,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, if (max_depth > sysctl_perf_event_max_stack) max_depth = sysctl_perf_event_max_stack; - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + trace = get_perf_callchain(regs, kernel, user, max_depth, false, false); if (unlikely(!trace)) @@ -451,7 +451,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, else if (kernel && task) trace = get_callchain_entry_for_task(task, max_depth); else - trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + trace = get_perf_callchain(regs, kernel, user, max_depth, crosstask, false); if (unlikely(!trace) || trace->nr < skip) { diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 
6c83ad674d01..b0f5bd228cd8 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -217,7 +217,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr } struct perf_callchain_entry * -get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, +get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; @@ -228,11 +228,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (!entry) return NULL; - ctx.entry = entry; - ctx.max_stack = max_stack; - ctx.nr = entry->nr = init_nr; - ctx.contexts = 0; - ctx.contexts_maxed = false; + ctx.entry = entry; + ctx.max_stack = max_stack; + ctx.nr = entry->nr = 0; + ctx.contexts = 0; + ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) diff --git a/kernel/events/core.c b/kernel/events/core.c index ea357044d780..bade8e0fced7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8210,7 +8210,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!kernel && !user) return &__empty_callchain; - callchain = get_perf_callchain(regs, 0, kernel, user, + callchain = get_perf_callchain(regs, kernel, user, max_stack, crosstask, true); return callchain ?: &__empty_callchain; } -- cgit v1.2.3 From 1abe21ef1adf0c5b6dbb5878c2fa4573df8d29fc Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sat, 23 Aug 2025 15:44:28 +0200 Subject: net: phy: introduce phy_id_compare_vendor() PHY ID helper Introduce phy_id_compare_vendor() PHY ID helper to compare a PHY ID with the PHY ID Vendor using the generic PHY ID Vendor mask. While at it, also rework the PHY_ID_MATCH macros and move the masks to dedicated defines so that PHY drivers can make use of them if needed. Signed-off-by: Christian Marangi Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250823134431.4854-1-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 4c2b8b6e7187..b67079796402 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1268,9 +1268,13 @@ struct phy_driver { #define to_phy_driver(d) container_of_const(to_mdio_common_driver(d), \ struct phy_driver, mdiodrv) -#define PHY_ID_MATCH_EXACT(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 0) -#define PHY_ID_MATCH_MODEL(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 4) -#define PHY_ID_MATCH_VENDOR(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 10) +#define PHY_ID_MATCH_EXTACT_MASK GENMASK(31, 0) +#define PHY_ID_MATCH_MODEL_MASK GENMASK(31, 4) +#define PHY_ID_MATCH_VENDOR_MASK GENMASK(31, 10) + +#define PHY_ID_MATCH_EXACT(id) .phy_id = (id), .phy_id_mask = PHY_ID_MATCH_EXTACT_MASK +#define PHY_ID_MATCH_MODEL(id) .phy_id = (id), .phy_id_mask = PHY_ID_MATCH_MODEL_MASK +#define PHY_ID_MATCH_VENDOR(id) .phy_id = (id), .phy_id_mask = PHY_ID_MATCH_VENDOR_MASK /** * phy_id_compare - compare @id1 with @id2 taking account of @mask @@ -1286,6 +1290,19 @@ static inline bool phy_id_compare(u32 id1, u32 id2, u32 mask) return !((id1 ^ id2) & mask); } +/** + * phy_id_compare_vendor - compare @id with @vendor mask + * @id: PHY ID + * @vendor_mask: PHY Vendor mask + * + * Return: true if the bits from @id match @vendor using the + * generic PHY Vendor mask. 
+ */ +static inline bool phy_id_compare_vendor(u32 id, u32 vendor_mask) +{ + return phy_id_compare(id, vendor_mask, PHY_ID_MATCH_VENDOR_MASK); +} + /** * phydev_id_compare - compare @id with the PHY's Clause 22 ID * @phydev: the PHY device -- cgit v1.2.3 From 39e94fdce45fa611abf48472873e4ba2f67228a3 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 22 Aug 2025 22:36:11 +0200 Subject: net: phy: fixed: let fixed_phy_add always use addr 0 and remove return value We have only two users of fixed_phy_add(), both use address 0 and ignore the return value. So simplify fixed_phy_add() accordingly. Whilst at it, constify the fixed_phy_status configs. Note: fixed_phy_add() is a legacy function which shouldn't be used in new code, as its use may be problematic: - No check whether a fixed phy exists already at the given address - If fixed_phy_register() is called afterwards by any other driver, then it will also use phy_addr 0, because fixed_phy_add() ignores the ida which manages address assignment Drivers using a fixed phy created by fixed_phy_add() in platform code should dynamically create a fixed phy with fixed_phy_register() instead. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/762700e5-a0b1-41af-aa03-929822a39475@gmail.com Signed-off-by: Jakub Kicinski --- arch/m68k/coldfire/m5272.c | 4 ++-- arch/mips/bcm47xx/setup.c | 4 ++-- drivers/net/phy/fixed_phy.c | 4 ++-- include/linux/phy_fixed.h | 8 ++------ 4 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/arch/m68k/coldfire/m5272.c b/arch/m68k/coldfire/m5272.c index 5b70dfdab368..918e2a3236c5 100644 --- a/arch/m68k/coldfire/m5272.c +++ b/arch/m68k/coldfire/m5272.c @@ -108,7 +108,7 @@ void __init config_BSP(char *commandp, int size) * an ethernet switch. In this case we need to use the fixed phy type, * and we need to declare it early in boot. 
*/ -static struct fixed_phy_status nettel_fixed_phy_status __initdata = { +static const struct fixed_phy_status nettel_fixed_phy_status __initconst = { .link = 1, .speed = 100, .duplex = 0, @@ -119,7 +119,7 @@ static struct fixed_phy_status nettel_fixed_phy_status __initdata = { static int __init init_BSP(void) { m5272_uarts_init(); - fixed_phy_add(0, &nettel_fixed_phy_status); + fixed_phy_add(&nettel_fixed_phy_status); clkdev_add_table(m5272_clk_lookup, ARRAY_SIZE(m5272_clk_lookup)); return 0; } diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c index de426a474b5b..a93a4266dc1e 100644 --- a/arch/mips/bcm47xx/setup.c +++ b/arch/mips/bcm47xx/setup.c @@ -256,7 +256,7 @@ static int __init bcm47xx_cpu_fixes(void) } arch_initcall(bcm47xx_cpu_fixes); -static struct fixed_phy_status bcm47xx_fixed_phy_status __initdata = { +static const struct fixed_phy_status bcm47xx_fixed_phy_status __initconst = { .link = 1, .speed = SPEED_100, .duplex = DUPLEX_FULL, @@ -282,7 +282,7 @@ static int __init bcm47xx_register_bus_complete(void) bcm47xx_leds_register(); bcm47xx_workarounds(); - fixed_phy_add(0, &bcm47xx_fixed_phy_status); + fixed_phy_add(&bcm47xx_fixed_phy_status); return 0; } device_initcall(bcm47xx_register_bus_complete); diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c index 8ad49dc1105d..5a1badb9bbab 100644 --- a/drivers/net/phy/fixed_phy.c +++ b/drivers/net/phy/fixed_phy.c @@ -158,9 +158,9 @@ static int fixed_phy_add_gpiod(unsigned int irq, int phy_addr, return 0; } -int fixed_phy_add(int phy_addr, const struct fixed_phy_status *status) +void fixed_phy_add(const struct fixed_phy_status *status) { - return fixed_phy_add_gpiod(PHY_POLL, phy_addr, status, NULL); + fixed_phy_add_gpiod(PHY_POLL, 0, status, NULL); } EXPORT_SYMBOL_GPL(fixed_phy_add); diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 5399b9e41e35..6227a1bdefec 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -17,7 +17,7 @@ struct net_device; #if IS_ENABLED(CONFIG_FIXED_PHY) extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier); -int fixed_phy_add(int phy_id, const struct fixed_phy_status *status); +void fixed_phy_add(const struct fixed_phy_status *status); struct phy_device *fixed_phy_register(const struct fixed_phy_status *status, struct device_node *np); @@ -26,11 +26,7 @@ extern int fixed_phy_set_link_update(struct phy_device *phydev, int (*link_update)(struct net_device *, struct fixed_phy_status *)); #else -static inline int fixed_phy_add(int phy_id, - const struct fixed_phy_status *status) -{ - return -ENODEV; -} +static inline void fixed_phy_add(const struct fixed_phy_status *status) {} static inline struct phy_device * fixed_phy_register(const struct fixed_phy_status *status, struct device_node *np) -- cgit v1.2.3 From 19a9a1ab5c3dce65fff4ac50700117039c23d525 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:32 -0700 Subject: KVM: Rename CONFIG_KVM_PRIVATE_MEM to CONFIG_KVM_GUEST_MEMFD Rename the Kconfig option CONFIG_KVM_PRIVATE_MEM to CONFIG_KVM_GUEST_MEMFD. The original name implied that the feature only supported "private" memory. However, CONFIG_KVM_PRIVATE_MEM enables guest_memfd in general, which is not exclusively for private memory. Subsequent patches in this series will add guest_memfd support for non-CoCo VMs, whose memory is not private. 
Renaming the Kconfig option to CONFIG_KVM_GUEST_MEMFD more accurately reflects its broader scope as the main Kconfig option for all guest_memfd-backed memory. This provides clearer semantics for the option and avoids confusion as new features are introduced. Reviewed-by: Ira Weiny Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Vlastimil Babka Reviewed-by: Xiaoyao Li Co-developed-by: David Hildenbrand Signed-off-by: David Hildenbrand Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- include/linux/kvm_host.h | 14 +++++++------- virt/kvm/Kconfig | 8 ++++---- virt/kvm/Makefile.kvm | 2 +- virt/kvm/kvm_main.c | 4 ++-- virt/kvm/kvm_mm.h | 4 ++-- 6 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f19a76d3ca0e..7b0f2b3e492d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2276,7 +2276,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level); -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem) #else #define kvm_arch_has_private_mem(kvm) false diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 15656b7fba6c..8cdc0b3cc1b1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -602,7 +602,7 @@ struct kvm_memory_slot { short id; u16 as_id; -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD struct { /* * Writes protected by kvm->slots_lock. Acquiring a @@ -720,10 +720,10 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) #endif /* - * Arch code must define kvm_arch_has_private_mem if support for private memory - * is enabled. + * Arch code must define kvm_arch_has_private_mem if support for guest_memfd is + * enabled. 
*/ -#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) +#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_GUEST_MEMFD) static inline bool kvm_arch_has_private_mem(struct kvm *kvm) { return false; @@ -2505,7 +2505,7 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) { - return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) && + return IS_ENABLED(CONFIG_KVM_GUEST_MEMFD) && kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE; } #else @@ -2515,7 +2515,7 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) } #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, struct page **page, int *max_order); @@ -2528,7 +2528,7 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, KVM_BUG_ON(1, kvm); return -EIO; } -#endif /* CONFIG_KVM_PRIVATE_MEM */ +#endif /* CONFIG_KVM_GUEST_MEMFD */ #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 727b542074e7..e4b400feff94 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -112,19 +112,19 @@ config KVM_GENERIC_MEMORY_ATTRIBUTES depends on KVM_GENERIC_MMU_NOTIFIER bool -config KVM_PRIVATE_MEM +config KVM_GUEST_MEMFD select XARRAY_MULTI bool config KVM_GENERIC_PRIVATE_MEM select KVM_GENERIC_MEMORY_ATTRIBUTES - select KVM_PRIVATE_MEM + select KVM_GUEST_MEMFD bool config HAVE_KVM_ARCH_GMEM_PREPARE bool - depends on KVM_PRIVATE_MEM + depends on KVM_GUEST_MEMFD config HAVE_KVM_ARCH_GMEM_INVALIDATE bool - depends on KVM_PRIVATE_MEM + depends on KVM_GUEST_MEMFD diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm index 724c89af78af..d047d4cf58c9 100644 --- a/virt/kvm/Makefile.kvm +++ b/virt/kvm/Makefile.kvm @@ -12,4 +12,4 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o -kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_memfd.o +kvm-$(CONFIG_KVM_GUEST_MEMFD) += $(KVM)/guest_memfd.o diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6c07dd423458..25a94eed75fd 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4915,7 +4915,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_MEMORY_ATTRIBUTES: return kvm_supported_mem_attributes(kvm); #endif -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD case KVM_CAP_GUEST_MEMFD: return !kvm || kvm_arch_has_private_mem(kvm); #endif @@ -5352,7 +5352,7 @@ static long kvm_vm_ioctl(struct file *filp, case KVM_GET_STATS_FD: r = kvm_vm_ioctl_get_stats_fd(kvm); break; -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD case KVM_CREATE_GUEST_MEMFD: { struct kvm_create_guest_memfd guest_memfd; diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index acef3f5c582a..31defb08ccba 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -67,7 +67,7 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, } #endif /* HAVE_KVM_PFNCACHE */ -#ifdef CONFIG_KVM_PRIVATE_MEM +#ifdef CONFIG_KVM_GUEST_MEMFD void kvm_gmem_init(struct module *module); int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args); int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot 
*slot, @@ -91,6 +91,6 @@ static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot) { WARN_ON_ONCE(1); } -#endif /* CONFIG_KVM_PRIVATE_MEM */ +#endif /* CONFIG_KVM_GUEST_MEMFD */ #endif /* __KVM_MM_H__ */ -- cgit v1.2.3 From 36cf63bb5df68836e55e2839f8174b404d47670b Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:36 -0700 Subject: KVM: Rename CONFIG_KVM_GENERIC_PRIVATE_MEM to CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE The original name was vague regarding its functionality. This Kconfig option specifically enables and gates the kvm_gmem_populate() function, which is responsible for populating a GPA range with guest data. The new name, HAVE_KVM_ARCH_GMEM_POPULATE, describes the purpose of the option: to enable arch-specific guest_memfd population mechanisms. It also follows the same pattern as the other HAVE_KVM_ARCH_* configuration options. This improves clarity for developers and ensures the name accurately reflects the functionality it controls, especially as guest_memfd support expands beyond purely "private" memory scenarios. Temporarily keep KVM_GENERIC_PRIVATE_MEM as an x86-only config so as to minimize churn, and to hopefully make it easier to see what features require HAVE_KVM_ARCH_GMEM_POPULATE. On that note, omit GMEM_POPULATE for KVM_X86_SW_PROTECTED_VM, as regular ol' memset() suffices for software-protected VMs. As for KVM_GENERIC_PRIVATE_MEM, a future change will select KVM_GUEST_MEMFD for all 64-bit KVM builds, at which point the intermediate config will become obsolete and can/will be dropped. Reviewed-by: Ira Weiny Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Vlastimil Babka Co-developed-by: David Hildenbrand Signed-off-by: David Hildenbrand Signed-off-by: Fuad Tabba Reviewed-by: Xiaoyao Li Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Kconfig | 14 ++++++++++---- include/linux/kvm_host.h | 2 +- virt/kvm/Kconfig | 9 ++++----- virt/kvm/guest_memfd.c | 2 +- 4 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 13ab7265b505..c763446d9b9f 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -79,11 +79,16 @@ config KVM_WERROR If in doubt, say "N". +config KVM_X86_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES + select KVM_GUEST_MEMFD + bool + config KVM_SW_PROTECTED_VM bool "Enable support for KVM software-protected VMs" depends on EXPERT depends on KVM_X86 && X86_64 - select KVM_GENERIC_PRIVATE_MEM + select KVM_X86_PRIVATE_MEM help Enable support for KVM software-protected VMs. Currently, software- protected VMs are purely a development and testing vehicle for @@ -133,8 +138,8 @@ config KVM_INTEL_TDX bool "Intel Trust Domain Extensions (TDX) support" default y depends on INTEL_TDX_HOST - select KVM_GENERIC_PRIVATE_MEM - select KVM_GENERIC_MEMORY_ATTRIBUTES + select KVM_X86_PRIVATE_MEM + select HAVE_KVM_ARCH_GMEM_POPULATE help Provides support for launching Intel Trust Domain Extensions (TDX) confidential VMs on Intel processors. 
@@ -157,9 +162,10 @@ config KVM_AMD_SEV depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) select ARCH_HAS_CC_PLATFORM - select KVM_GENERIC_PRIVATE_MEM + select KVM_X86_PRIVATE_MEM select HAVE_KVM_ARCH_GMEM_PREPARE select HAVE_KVM_ARCH_GMEM_INVALIDATE + select HAVE_KVM_ARCH_GMEM_POPULATE help Provides support for launching encrypted VMs which use Secure Encrypted Virtualization (SEV), Secure Encrypted Virtualization with diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8cdc0b3cc1b1..ddfb6cfe20a6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2534,7 +2534,7 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm, int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order); #endif -#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE /** * kvm_gmem_populate() - Populate/prepare a GPA range with guest data * diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index e4b400feff94..1b7d5be0b6c4 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -116,11 +116,6 @@ config KVM_GUEST_MEMFD select XARRAY_MULTI bool -config KVM_GENERIC_PRIVATE_MEM - select KVM_GENERIC_MEMORY_ATTRIBUTES - select KVM_GUEST_MEMFD - bool - config HAVE_KVM_ARCH_GMEM_PREPARE bool depends on KVM_GUEST_MEMFD @@ -128,3 +123,7 @@ config HAVE_KVM_ARCH_GMEM_PREPARE config HAVE_KVM_ARCH_GMEM_INVALIDATE bool depends on KVM_GUEST_MEMFD + +config HAVE_KVM_ARCH_GMEM_POPULATE + bool + depends on KVM_GUEST_MEMFD diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 7d85cc33c0bb..b2b50560e80e 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -627,7 +627,7 @@ out: } EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); -#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque) { -- cgit v1.2.3 From 923310be23b275f730e8869abc783db6296fc043 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:37 -0700 Subject: KVM: Rename kvm_slot_can_be_private() to kvm_slot_has_gmem() Rename kvm_slot_can_be_private() to kvm_slot_has_gmem() to improve clarity and accurately reflect its purpose. The function kvm_slot_can_be_private() was previously used to check if a given kvm_memory_slot is backed by guest_memfd. However, its name implied that the memory in such a slot was exclusively "private". As guest_memfd support expands to include non-private memory (e.g., shared host mappings), it's important to remove this association. The new name, kvm_slot_has_gmem(), states that the slot is backed by guest_memfd without making assumptions about the memory's privacy attributes. 
Reviewed-by: Ira Weiny Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Vlastimil Babka Reviewed-by: Xiaoyao Li Co-developed-by: David Hildenbrand Signed-off-by: David Hildenbrand Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- arch/x86/kvm/svm/sev.c | 4 ++-- include/linux/kvm_host.h | 2 +- virt/kvm/guest_memfd.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6e838cb6c9e1..fdc2824755ee 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3312,7 +3312,7 @@ static int __kvm_mmu_max_mapping_level(struct kvm *kvm, int kvm_mmu_max_mapping_level(struct kvm *kvm, const struct kvm_memory_slot *slot, gfn_t gfn) { - bool is_private = kvm_slot_can_be_private(slot) && + bool is_private = kvm_slot_has_gmem(slot) && kvm_mem_is_private(kvm, gfn); return __kvm_mmu_max_mapping_level(kvm, slot, gfn, PG_LEVEL_NUM, is_private); @@ -4551,7 +4551,7 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, { int max_order, r; - if (!kvm_slot_can_be_private(fault->slot)) { + if (!kvm_slot_has_gmem(fault->slot)) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return -EFAULT; } diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0635bd71c10e..966a330dd294 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2361,7 +2361,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp) mutex_lock(&kvm->slots_lock); memslot = gfn_to_memslot(kvm, params.gfn_start); - if (!kvm_slot_can_be_private(memslot)) { + if (!kvm_slot_has_gmem(memslot)) { ret = -EINVAL; goto out; } @@ -4715,7 +4715,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) } slot = gfn_to_memslot(kvm, gfn); - if (!kvm_slot_can_be_private(slot)) { + if (!kvm_slot_has_gmem(slot)) { pr_warn_ratelimited("SEV: Unexpected RMP fault, non-private slot for GPA 0x%llx\n", gpa); return; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ddfb6cfe20a6..4c5e0a898652 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -615,7 +615,7 @@ struct kvm_memory_slot { #endif }; -static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot) +static inline bool kvm_slot_has_gmem(const struct kvm_memory_slot *slot) { return slot && (slot->flags & KVM_MEM_GUEST_MEMFD); } diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index b2b50560e80e..a99e11b8b77f 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -643,7 +643,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long return -EINVAL; slot = gfn_to_memslot(kvm, start_gfn); - if (!kvm_slot_can_be_private(slot)) + if (!kvm_slot_has_gmem(slot)) return -EINVAL; file = kvm_gmem_get_file(slot); -- cgit v1.2.3 From 69116e01f6fee030db45d269f28f9c300b8dc9d6 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:38 -0700 Subject: KVM: Fix comments that refer to slots_lock Fix comments so that they refer to slots_lock instead of slots_locks (remove trailing s). 
Reviewed-by: David Hildenbrand Reviewed-by: Ira Weiny Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Vlastimil Babka Reviewed-by: Xiaoyao Li Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4c5e0a898652..5c25b03d3d50 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -860,7 +860,7 @@ struct kvm { struct notifier_block pm_notifier; #endif #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES - /* Protected by slots_locks (for writes) and RCU (for reads) */ + /* Protected by slots_lock (for writes) and RCU (for reads) */ struct xarray mem_attr_array; #endif char stats_id[KVM_STATS_NAME_SIZE]; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 25a94eed75fd..aa86dfd757db 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -331,7 +331,7 @@ void kvm_flush_remote_tlbs_memslot(struct kvm *kvm, * All current use cases for flushing the TLBs for a specific memslot * are related to dirty logging, and many do the TLB flush out of * mmu_lock. The interaction between the various operations on memslot - * must be serialized by slots_locks to ensure the TLB flush from one + * must be serialized by slots_lock to ensure the TLB flush from one * operation is observed by any other operation on the same memslot. */ lockdep_assert_held(&kvm->slots_lock); -- cgit v1.2.3 From 68d189938709a5918d7308eb922c30bcbf16ebb9 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:39 -0700 Subject: KVM: Fix comment that refers to kvm uapi header path The comment that points to the path where the user-visible memslot flags are refers to an outdated path and has a typo. Update the comment to refer to the correct path. Reviewed-by: David Hildenbrand Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Vlastimil Babka Reviewed-by: Xiaoyao Li Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5c25b03d3d50..56ea8c862cfd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -52,7 +52,7 @@ /* * The bit 16 ~ bit 31 of kvm_userspace_memory_region::flags are internally * used in kvm, other bits are visible for userspace which are defined in - * include/linux/kvm_h. + * include/uapi/linux/kvm.h. */ #define KVM_MEMSLOT_INVALID (1UL << 16) -- cgit v1.2.3 From d1e54dd08f163a9021433020d16a8f8f70ddc41c Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:40 -0700 Subject: KVM: x86: Enable KVM_GUEST_MEMFD for all 64-bit builds Enable KVM_GUEST_MEMFD for all KVM x86 64-bit builds, i.e. for "default" VM types when running on 64-bit KVM. This will allow using guest_memfd to back non-private memory for all VM shapes, by supporting mmap() on guest_memfd. Opportunistically clean up various conditionals that become tautologies once x86 selects KVM_GUEST_MEMFD more broadly. Specifically, because SW protected VMs, SEV, and TDX are all 64-bit only, private memory no longer needs to take explicit dependencies on KVM_GUEST_MEMFD, because it is effectively a prerequisite. 
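(Editorial aside, illustrating the "tautology" cleanups rather than quoting them: once KVM_GUEST_MEMFD is selected by every 64-bit x86 build, IS_ENABLED() guards like the sketch below fold to a compile-time 1, and the hunks that follow simply delete the dead half of the expression. The helper name here is hypothetical; the functions and config symbol it calls are the real ones from this series.)

/* Minimal sketch, assuming a 64-bit x86 build. */
static inline bool sketch_mem_is_private(struct kvm *kvm, gfn_t gfn)
{
	return IS_ENABLED(CONFIG_KVM_GUEST_MEMFD) &&	/* now always 1 */
	       kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
}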
Suggested-by: Sean Christopherson Signed-off-by: Fuad Tabba Reviewed-by: Xiaoyao Li Reviewed-by: David Hildenbrand Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 +--- arch/x86/kvm/Kconfig | 12 ++++-------- include/linux/kvm_host.h | 9 ++------- virt/kvm/kvm_main.c | 4 ++-- 4 files changed, 9 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7b0f2b3e492d..50366a1ca192 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2276,10 +2276,8 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level); -#ifdef CONFIG_KVM_GUEST_MEMFD +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem) -#else -#define kvm_arch_has_private_mem(kvm) false #endif #define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state) diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index c763446d9b9f..4e43923656d0 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -47,6 +47,7 @@ config KVM_X86 select KVM_GENERIC_HARDWARE_ENABLING select KVM_GENERIC_PRE_FAULT_MEMORY select KVM_WERROR if WERROR + select KVM_GUEST_MEMFD if X86_64 config KVM tristate "Kernel-based Virtual Machine (KVM) support" @@ -79,16 +80,11 @@ config KVM_WERROR If in doubt, say "N". -config KVM_X86_PRIVATE_MEM - select KVM_GENERIC_MEMORY_ATTRIBUTES - select KVM_GUEST_MEMFD - bool - config KVM_SW_PROTECTED_VM bool "Enable support for KVM software-protected VMs" depends on EXPERT depends on KVM_X86 && X86_64 - select KVM_X86_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES help Enable support for KVM software-protected VMs. Currently, software- protected VMs are purely a development and testing vehicle for @@ -138,7 +134,7 @@ config KVM_INTEL_TDX bool "Intel Trust Domain Extensions (TDX) support" default y depends on INTEL_TDX_HOST - select KVM_X86_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES select HAVE_KVM_ARCH_GMEM_POPULATE help Provides support for launching Intel Trust Domain Extensions (TDX) @@ -162,7 +158,7 @@ config KVM_AMD_SEV depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) select ARCH_HAS_CC_PLATFORM - select KVM_X86_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES select HAVE_KVM_ARCH_GMEM_PREPARE select HAVE_KVM_ARCH_GMEM_INVALIDATE select HAVE_KVM_ARCH_GMEM_POPULATE diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 56ea8c862cfd..4d1c44622056 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -719,11 +719,7 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) } #endif -/* - * Arch code must define kvm_arch_has_private_mem if support for guest_memfd is - * enabled. 
- */ -#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_GUEST_MEMFD) +#ifndef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES static inline bool kvm_arch_has_private_mem(struct kvm *kvm) { return false; @@ -2505,8 +2501,7 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) { - return IS_ENABLED(CONFIG_KVM_GUEST_MEMFD) && - kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE; + return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE; } #else static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index aa86dfd757db..4f57cb92e109 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1588,7 +1588,7 @@ static int check_memory_region_flags(struct kvm *kvm, { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; - if (kvm_arch_has_private_mem(kvm)) + if (IS_ENABLED(CONFIG_KVM_GUEST_MEMFD)) valid_flags |= KVM_MEM_GUEST_MEMFD; /* Dirty logging private memory is not currently supported. */ @@ -4917,7 +4917,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #endif #ifdef CONFIG_KVM_GUEST_MEMFD case KVM_CAP_GUEST_MEMFD: - return !kvm || kvm_arch_has_private_mem(kvm); + return 1; #endif default: break; -- cgit v1.2.3 From a12578e1477cbfb547256ed8dee6d5142a59cdcd Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:41 -0700 Subject: KVM: guest_memfd: Add plumbing to host to map guest_memfd pages Introduce the core infrastructure to enable host userspace to mmap() guest_memfd-backed memory. This is needed for several evolving KVM use cases: * Non-CoCo VM backing: Allows VMMs like Firecracker to run guests entirely backed by guest_memfd, even for non-CoCo VMs [1]. This provides a unified memory management model and simplifies guest memory handling. * Direct map removal for enhanced security: This is an important step for direct map removal of guest memory [2]. By allowing host userspace to fault in guest_memfd pages directly, we can avoid maintaining host kernel direct maps of guest memory. This provides additional hardening against Spectre-like transient execution attacks by removing a potential attack surface within the kernel. * Future guest_memfd features: This also lays the groundwork for future enhancements to guest_memfd, such as supporting huge pages and enabling in-place sharing of guest memory with the host for CoCo platforms that permit it [3]. Enable the basic mmap and fault handling logic within guest_memfd, but hold off on allowing userspace to actually do mmap() until the architecture support is also in place.
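(Editorial sketch of the intended userspace flow, not part of the patch: KVM_CREATE_GUEST_MEMFD and struct kvm_create_guest_memfd are the existing uapi seen elsewhere in this log; the helper below is hypothetical. Until an architecture opts in, kvm_gmem_supports_mmap() returns false and the mmap() fails with ENODEV, matching the hold-off described above.)

#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

static void *map_guest_memfd(int vm_fd, uint64_t size)
{
	struct kvm_create_guest_memfd args = { .size = size, .flags = 0 };
	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);

	if (gmem_fd < 0)
		return MAP_FAILED;

	/* guest_memfd only accepts shared mappings (VM_SHARED | VM_MAYSHARE). */
	return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, gmem_fd, 0);
}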
[1] https://github.com/firecracker-microvm/firecracker/tree/feature/secret-hiding [2] https://lore.kernel.org/linux-mm/cc1bb8e9bc3e1ab637700a4d3defeec95b55060a.camel@amazon.com [3] https://lore.kernel.org/all/c1c9591d-218a-495c-957b-ba356c8f8e09@redhat.com/T/#u Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Acked-by: David Hildenbrand Co-developed-by: Ackerley Tng Signed-off-by: Ackerley Tng Signed-off-by: Fuad Tabba Reviewed-by: Xiaoyao Li Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-11-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 11 ++++++++ include/linux/kvm_host.h | 4 +++ virt/kvm/guest_memfd.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 604490b1cb19..33fba801b205 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13521,6 +13521,16 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); +#ifdef CONFIG_KVM_GUEST_MEMFD +/* + * KVM doesn't yet support mmap() on guest_memfd for VMs with private memory + * (the private vs. shared tracking needs to be moved into guest_memfd). + */ +bool kvm_arch_supports_gmem_mmap(struct kvm *kvm) +{ + return !kvm_arch_has_private_mem(kvm); +} + #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order) { @@ -13534,6 +13544,7 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end) kvm_x86_call(gmem_invalidate)(start, end); } #endif +#endif int kvm_spec_ctrl_test_value(u64 value) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4d1c44622056..26bad600f9fa 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -726,6 +726,10 @@ static inline bool kvm_arch_has_private_mem(struct kvm *kvm) } #endif +#ifdef CONFIG_KVM_GUEST_MEMFD +bool kvm_arch_supports_gmem_mmap(struct kvm *kvm); +#endif + #ifndef kvm_arch_has_readonly_mem static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm) { diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index a99e11b8b77f..67e7cd7210ef 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -312,7 +312,72 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) return gfn - slot->base_gfn + slot->gmem.pgoff; } +static bool kvm_gmem_supports_mmap(struct inode *inode) +{ + return false; +} + +static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + struct folio *folio; + vm_fault_t ret = VM_FAULT_LOCKED; + + if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) + return VM_FAULT_SIGBUS; + + folio = kvm_gmem_get_folio(inode, vmf->pgoff); + if (IS_ERR(folio)) { + int err = PTR_ERR(folio); + + if (err == -EAGAIN) + return VM_FAULT_RETRY; + + return vmf_error(err); + } + + if (WARN_ON_ONCE(folio_test_large(folio))) { + ret = VM_FAULT_SIGBUS; + goto out_folio; + } + + if (!folio_test_uptodate(folio)) { + clear_highpage(folio_page(folio, 0)); + kvm_gmem_mark_prepared(folio); + } + + vmf->page = folio_file_page(folio, vmf->pgoff); + +out_folio: + if (ret != VM_FAULT_LOCKED) { + folio_unlock(folio); + folio_put(folio); + } + + return ret; +} + +static const struct vm_operations_struct kvm_gmem_vm_ops = { + .fault = kvm_gmem_fault_user_mapping, +}; + +static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!kvm_gmem_supports_mmap(file_inode(file))) + return 
-ENODEV; + + if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) != + (VM_SHARED | VM_MAYSHARE)) { + return -EINVAL; + } + + vma->vm_ops = &kvm_gmem_vm_ops; + + return 0; +} + static struct file_operations kvm_gmem_fops = { + .mmap = kvm_gmem_mmap, .open = generic_file_open, .release = kvm_gmem_release, .fallocate = kvm_gmem_fallocate, @@ -391,6 +456,11 @@ static const struct inode_operations kvm_gmem_iops = { .setattr = kvm_gmem_setattr, }; +bool __weak kvm_arch_supports_gmem_mmap(struct kvm *kvm) +{ + return true; +} + static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) { const char *anon_name = "[kvm-gmem]"; -- cgit v1.2.3 From 576d035e2aef52f8d8d3ce29af556d4c6bd2e0fe Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:42 -0700 Subject: KVM: guest_memfd: Track guest_memfd mmap support in memslot Add a new internal flag, KVM_MEMSLOT_GMEM_ONLY, to the top half of memslot->flags (which makes it strictly for KVM's internal use). This flag tracks when a guest_memfd-backed memory slot supports host userspace mmap operations, which implies that all memory, not just private memory for CoCo VMs, is consumed through guest_memfd: "gmem only". This optimization avoids repeatedly checking the underlying guest_memfd file for mmap support, which would otherwise require taking and releasing a reference on the file for each check. By caching this information directly in the memslot, we reduce overhead and simplify the logic involved in handling guest_memfd-backed pages for host mappings. Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Xiaoyao Li Acked-by: David Hildenbrand Suggested-by: David Hildenbrand Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-12-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 11 ++++++++++- virt/kvm/guest_memfd.c | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 26bad600f9fa..8b47891adca1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -54,7 +54,8 @@ * used in kvm, other bits are visible for userspace which are defined in * include/uapi/linux/kvm.h. 
*/ -#define KVM_MEMSLOT_INVALID (1UL << 16) +#define KVM_MEMSLOT_INVALID (1UL << 16) +#define KVM_MEMSLOT_GMEM_ONLY (1UL << 17) /* * Bit 63 of the memslot generation number is an "update in-progress flag", @@ -2490,6 +2491,14 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE; } +static inline bool kvm_memslot_is_gmem_only(const struct kvm_memory_slot *slot) +{ + if (!IS_ENABLED(CONFIG_KVM_GUEST_MEMFD)) + return false; + + return slot->flags & KVM_MEMSLOT_GMEM_ONLY; +} + #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn) { diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 67e7cd7210ef..d5b445548af4 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -578,6 +578,8 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, */ WRITE_ONCE(slot->gmem.file, file); slot->gmem.pgoff = start; + if (kvm_gmem_supports_mmap(inode)) + slot->flags |= KVM_MEMSLOT_GMEM_ONLY; xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL); filemap_invalidate_unlock(inode->i_mapping); -- cgit v1.2.3 From dd6a5a71c811289eec234e78cb9ca34d055d2ad5 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 10 Jun 2025 13:52:28 +0900 Subject: sched/wait: Add wait_event_state_exclusive() Allows exclusive waits with a custom @state. Signed-off-by: Sergey Senozhatsky Acked-by: Peter Zijlstra (Intel) Signed-off-by: Miklos Szeredi --- include/linux/wait.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 09855d819418..f648044466d5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -965,6 +965,18 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *); __ret; \ }) +#define __wait_event_state_exclusive(wq, condition, state) \ + ___wait_event(wq, condition, state, 1, 0, schedule()) + +#define wait_event_state_exclusive(wq, condition, state) \ +({ \ + int __ret = 0; \ + might_sleep(); \ + if (!(condition)) \ + __ret = __wait_event_state_exclusive(wq, condition, state); \ + __ret; \ +}) + #define __wait_event_killable_timeout(wq_head, condition, timeout) \ ___wait_event(wq_head, ___wait_cond_timeout(condition), \ TASK_KILLABLE, 0, timeout, \ -- cgit v1.2.3 From 494d2f508883a6e5c4530e5c6b3c8b2bbfb7318d Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 7 Jul 2025 16:46:05 -0700 Subject: fuse: use default writeback accounting commit 0c58a97f919c ("fuse: remove tmp folio for writebacks and internal rb tree") removed temp folios for dirty page writeback. Consequently, fuse can now use the default writeback accounting. Switching fuse to the default writeback accounting also brings added benefits: wb->writeback_inodes tracking is now kept up to date, and writeback throughput estimates are updated after writeback completion. This commit also removes inc_wb_stat() and dec_wb_stat(), which no longer have any callers now that fuse does not use them.
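(Illustrative note, not taken from the patch: the removed helpers were one-line wrappers, so the open-coded equivalent for any future caller is wb_stat_mod(), which remains.)

	wb_stat_mod(wb, WB_WRITEBACK, 1);	/* was inc_wb_stat(wb, WB_WRITEBACK) */
	wb_stat_mod(wb, WB_WRITEBACK, -1);	/* was dec_wb_stat(wb, WB_WRITEBACK) */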
Signed-off-by: Joanne Koong Reviewed-by: David Hildenbrand Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 9 +-------- fs/fuse/inode.c | 2 -- include/linux/backing-dev.h | 10 ---------- 3 files changed, 1 insertion(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ffda61c9bc0c..0698bcb2f992 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1822,19 +1822,15 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); - struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_folios; i++) { + for (i = 0; i < ap->num_folios; i++) /* * Benchmarks showed that ending writeback within the * scope of the fi->lock alleviates xarray lock * contention and noticeably improves performance. */ iomap_finish_folio_write(inode, ap->folios[i], 1); - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - wb_writeout_inc(&bdi->wb); - } wake_up(&fi->page_waitq); } @@ -2009,14 +2005,11 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, uint32_t folio_index, loff_t offset, unsigned len) { - struct inode *inode = folio->mapping->host; struct fuse_args_pages *ap = &wpa->ia.ap; ap->folios[folio_index] = folio; ap->descs[folio_index].offset = offset; ap->descs[folio_index].length = len; - - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7ddfd2b3cc9c..19fc58cb84dc 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1561,8 +1561,6 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) if (err) return err; - /* fuse does it's own writeback accounting */ - sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT; sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT; /* diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e721148c95d0..9a1e895dd5df 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -66,16 +66,6 @@ static inline void wb_stat_mod(struct bdi_writeback *wb, percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); } -static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) -{ - wb_stat_mod(wb, item, 1); -} - -static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) -{ - wb_stat_mod(wb, item, -1); -} - static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { return percpu_counter_read_positive(&wb->stat[item]); -- cgit v1.2.3 From 2841808f35eebfd07150333f3af3007cb2904a09 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 7 Jul 2025 16:46:06 -0700 Subject: mm: remove BDI_CAP_WRITEBACK_ACCT There are no users of BDI_CAP_WRITEBACK_ACCT now that fuse doesn't do its own writeback accounting. This commit removes BDI_CAP_WRITEBACK_ACCT. 
Signed-off-by: Joanne Koong Acked-by: David Hildenbrand Signed-off-by: Miklos Szeredi --- include/linux/backing-dev.h | 4 +--- mm/backing-dev.c | 2 +- mm/page-writeback.c | 43 ++++++++++++++++++------------------------- 3 files changed, 20 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 9a1e895dd5df..3e64f14739dd 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -108,12 +108,10 @@ int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit * * BDI_CAP_WRITEBACK: Supports dirty page writeback, and dirty pages * should contribute to accounting - * BDI_CAP_WRITEBACK_ACCT: Automatically account writeback pages * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold */ #define BDI_CAP_WRITEBACK (1 << 0) -#define BDI_CAP_WRITEBACK_ACCT (1 << 1) -#define BDI_CAP_STRICTLIMIT (1 << 2) +#define BDI_CAP_STRICTLIMIT (1 << 1) extern struct backing_dev_info noop_backing_dev_info; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 783904d8c5ef..35f11e75e30e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1026,7 +1026,7 @@ struct backing_dev_info *bdi_alloc(int node_id) kfree(bdi); return NULL; } - bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; + bdi->capabilities = BDI_CAP_WRITEBACK; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3e248d1c3969..de669636120d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -3014,26 +3014,22 @@ bool __folio_end_writeback(struct folio *folio) if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; - struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = inode_to_wb(inode); unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); __xa_clear_mark(&mapping->i_pages, folio_index(folio), PAGECACHE_TAG_WRITEBACK); - if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { - struct bdi_writeback *wb = inode_to_wb(inode); - wb_stat_mod(wb, WB_WRITEBACK, -nr); - __wb_writeout_add(wb, nr); - if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) - wb_inode_writeback_end(wb); + wb_stat_mod(wb, WB_WRITEBACK, -nr); + __wb_writeout_add(wb, nr); + if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { + wb_inode_writeback_end(wb); + if (mapping->host) + sb_clear_inode_writeback(mapping->host); } - if (mapping->host && !mapping_tagged(mapping, - PAGECACHE_TAG_WRITEBACK)) - sb_clear_inode_writeback(mapping->host); - xa_unlock_irqrestore(&mapping->i_pages, flags); } else { ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); @@ -3058,7 +3054,7 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) if (mapping && mapping_use_writeback_tags(mapping)) { XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct inode *inode = mapping->host; - struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = inode_to_wb(inode); unsigned long flags; bool on_wblist; @@ -3069,21 +3065,18 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); - if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { - struct bdi_writeback *wb = inode_to_wb(inode); - - wb_stat_mod(wb, WB_WRITEBACK, nr); 
- if (!on_wblist) - wb_inode_writeback_start(wb); + wb_stat_mod(wb, WB_WRITEBACK, nr); + if (!on_wblist) { + wb_inode_writeback_start(wb); + /* + * We can come through here when swapping anonymous + * folios, so we don't necessarily have an inode to + * track for sync. + */ + if (mapping->host) + sb_mark_inode_writeback(mapping->host); } - /* - * We can come through here when swapping anonymous - * folios, so we don't necessarily have an inode to - * track for sync. - */ - if (mapping->host && !on_wblist) - sb_mark_inode_writeback(mapping->host); if (!folio_test_dirty(folio)) xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) -- cgit v1.2.3 From 08383cd479f8212fafee2f557b58cfd48818bee0 Mon Sep 17 00:00:00 2001 From: Christian Bruel Date: Wed, 20 Aug 2025 09:54:02 +0200 Subject: pinctrl: Add pinctrl_pm_select_init_state helper function If a platform requires an initial pinctrl state during probing, this helper function provides the client with access to the same initial state. eg: xxx_suspend_noirq ... pinctrl_pm_select_sleep_state xxx resume_noirq pinctrl_pm_select_init_state ... pinctrl_pm_select_default_state Signed-off-by: Christian Bruel Signed-off-by: Manivannan Sadhasivam Reviewed-by: Linus Walleij Link: https://patch.msgid.link/20250820075411.1178729-3-christian.bruel@foss.st.com --- drivers/pinctrl/core.c | 13 +++++++++++++ include/linux/pinctrl/consumer.h | 10 ++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index 73b78d6eac67..c5dbf4e9db84 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -1655,6 +1655,19 @@ int pinctrl_pm_select_default_state(struct device *dev) } EXPORT_SYMBOL_GPL(pinctrl_pm_select_default_state); +/** + * pinctrl_pm_select_init_state() - select init pinctrl state for PM + * @dev: device to select init state for + */ +int pinctrl_pm_select_init_state(struct device *dev) +{ + if (!dev->pins) + return 0; + + return pinctrl_select_bound_state(dev, dev->pins->init_state); +} +EXPORT_SYMBOL_GPL(pinctrl_pm_select_init_state); + /** * pinctrl_pm_select_sleep_state() - select sleep pinctrl state for PM * @dev: device to select sleep state for diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h index 73de70362b98..63ce16191eb9 100644 --- a/include/linux/pinctrl/consumer.h +++ b/include/linux/pinctrl/consumer.h @@ -48,6 +48,7 @@ int pinctrl_select_default_state(struct device *dev); #ifdef CONFIG_PM int pinctrl_pm_select_default_state(struct device *dev); +int pinctrl_pm_select_init_state(struct device *dev); int pinctrl_pm_select_sleep_state(struct device *dev); int pinctrl_pm_select_idle_state(struct device *dev); #else @@ -55,6 +56,10 @@ static inline int pinctrl_pm_select_default_state(struct device *dev) { return 0; } +static inline int pinctrl_pm_select_init_state(struct device *dev) +{ + return 0; +} static inline int pinctrl_pm_select_sleep_state(struct device *dev) { return 0; @@ -143,6 +148,11 @@ static inline int pinctrl_pm_select_default_state(struct device *dev) return 0; } +static inline int pinctrl_pm_select_init_state(struct device *dev) +{ + return 0; +} + static inline int pinctrl_pm_select_sleep_state(struct device *dev) { return 0; -- cgit v1.2.3 From 1df7dad4d5c49335b72e26d833def960b2de76e3 Mon Sep 17 00:00:00 2001 From: Nandakumar Edamana Date: Tue, 26 Aug 2025 09:15:23 +0530 Subject: bpf: Improve the general precision of tnum_mul Drop the value-mask decomposition technique and adopt straightforward 
long-multiplication with a twist: when LSB(a) is uncertain, find the two partial products (for LSB(a) = known 0 and LSB(a) = known 1) and take a union. Experiment shows that applying this technique in long multiplication improves the precision in a significant number of cases (at the cost of losing precision in a relatively lower number of cases). Signed-off-by: Nandakumar Edamana Signed-off-by: Andrii Nakryiko Tested-by: Harishankar Vishwanathan Reviewed-by: Harishankar Vishwanathan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20250826034524.2159515-1-nandakumar@nandakumar.co.in --- include/linux/tnum.h | 3 +++ kernel/bpf/tnum.c | 55 +++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 45 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 0ffb77ffe0e8..c52b862dad45 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -57,6 +57,9 @@ bool tnum_overlap(struct tnum a, struct tnum b); /* Return a tnum representing numbers satisfying both @a and @b */ struct tnum tnum_intersect(struct tnum a, struct tnum b); +/* Returns a tnum representing numbers satisfying either @a or @b */ +struct tnum tnum_union(struct tnum t1, struct tnum t2); + /* Return @a with all but the lowest @size bytes cleared */ struct tnum tnum_cast(struct tnum a, u8 size); diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index d9328bbb3680..f8e70e9c3998 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -116,31 +116,47 @@ struct tnum tnum_xor(struct tnum a, struct tnum b) return TNUM(v & ~mu, mu); } -/* Generate partial products by multiplying each bit in the multiplier (tnum a) - * with the multiplicand (tnum b), and add the partial products after - * appropriately bit-shifting them. Instead of directly performing tnum addition - * on the generated partial products, equivalenty, decompose each partial - * product into two tnums, consisting of the value-sum (acc_v) and the - * mask-sum (acc_m) and then perform tnum addition on them. The following paper - * explains the algorithm in more detail: https://arxiv.org/abs/2105.05398. +/* Perform long multiplication, iterating through the bits in a using rshift: + * - if LSB(a) is a known 0, keep current accumulator + * - if LSB(a) is a known 1, add b to current accumulator + * - if LSB(a) is unknown, take a union of the above cases. + * + * For example: + * + * acc_0: acc_1: + * + * 11 * -> 11 * -> 11 * -> union(0011, 1001) == x0x1 + * x1 01 11 + * ------ ------ ------ + * 11 11 11 + * xx 00 11 + * ------ ------ ------ + * ???? 0011 1001 */ struct tnum tnum_mul(struct tnum a, struct tnum b) { - u64 acc_v = a.value * b.value; - struct tnum acc_m = TNUM(0, 0); + struct tnum acc = TNUM(0, 0); while (a.value || a.mask) { /* LSB of tnum a is a certain 1 */ if (a.value & 1) - acc_m = tnum_add(acc_m, TNUM(0, b.mask)); + acc = tnum_add(acc, b); /* LSB of tnum a is uncertain */ - else if (a.mask & 1) - acc_m = tnum_add(acc_m, TNUM(0, b.value | b.mask)); + else if (a.mask & 1) { + /* acc = tnum_union(acc_0, acc_1), where acc_0 and + * acc_1 are partial accumulators for cases + * LSB(a) = certain 0 and LSB(a) = certain 1. + * acc_0 = acc + 0 * b = acc. + * acc_1 = acc + 1 * b = tnum_add(acc, b). 
+ */ + + acc = tnum_union(acc, tnum_add(acc, b)); + } /* Note: no case for LSB is certain 0 */ a = tnum_rshift(a, 1); b = tnum_lshift(b, 1); } - return tnum_add(TNUM(acc_v, 0), acc_m); + return acc; } bool tnum_overlap(struct tnum a, struct tnum b) @@ -163,6 +179,19 @@ struct tnum tnum_intersect(struct tnum a, struct tnum b) return TNUM(v & ~mu, mu); } +/* Returns a tnum with the uncertainty from both a and b, and in addition, new + * uncertainty at any position that a and b disagree. This represents a + * superset of the union of the concrete sets of both a and b. Despite the + * overapproximation, it is optimal. + */ +struct tnum tnum_union(struct tnum a, struct tnum b) +{ + u64 v = a.value & b.value; + u64 mu = (a.value ^ b.value) | a.mask | b.mask; + + return TNUM(v & ~mu, mu); +} + struct tnum tnum_cast(struct tnum a, u8 size) { a.value &= (1ULL << (size * 8)) - 1; -- cgit v1.2.3 From cb4d5a6eb600a43c2e3ec7f54e06d07aa33d8062 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:28 +0000 Subject: net: add sk_drops_skbadd() helper Existing sk_drops_add() helper is renamed to sk_drops_skbadd(). Add sk_drops_add() and convert sk_drops_inc() to use it. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-3-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/skmsg.h | 2 +- include/net/sock.h | 11 ++++++++--- include/net/udp.h | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- net/mptcp/protocol.c | 2 +- 7 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 0b9095a281b8..49847888c287 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -315,7 +315,7 @@ static inline bool sk_psock_test_state(const struct sk_psock *psock, static inline void sock_drop(struct sock *sk, struct sk_buff *skb) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); kfree_skb(skb); } diff --git a/include/net/sock.h b/include/net/sock.h index 34d7029eb622..9edb42ff0622 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2682,9 +2682,14 @@ struct sock_skb_cb { #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) +static inline void sk_drops_add(struct sock *sk, int segs) +{ + atomic_add(segs, &sk->sk_drops); +} + static inline void sk_drops_inc(struct sock *sk) { - atomic_inc(&sk->sk_drops); + sk_drops_add(sk, 1); } static inline int sk_drops_read(const struct sock *sk) @@ -2704,11 +2709,11 @@ sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) sk_drops_read(sk) : 0; } -static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) +static inline void sk_drops_skbadd(struct sock *sk, const struct sk_buff *skb) { int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); - atomic_add(segs, &sk->sk_drops); + sk_drops_add(sk, segs); } static inline ktime_t sock_read_timestamp(struct sock *sk) diff --git a/include/net/udp.h b/include/net/udp.h index e2af3bda90c9..7b26d4c50f33 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -627,7 +627,7 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, return segs; drop: - atomic_add(drop_count, &sk->sk_drops); + sk_drops_add(sk, drop_count); SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, drop_count); kfree_skb(skb); return NULL; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a52a747d8a55..f1be65af1a77 100644 --- a/net/ipv4/tcp_input.c +++ 
b/net/ipv4/tcp_input.c @@ -4830,7 +4830,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk, noinline_for_tracing static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); sk_skb_reason_drop(sk, skb, reason); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a0c93b24c6e0..7c1d612afca1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2254,7 +2254,7 @@ lookup: &iph->saddr, &iph->daddr, AF_INET, dif, sdif); if (unlikely(drop_reason)) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); reqsk_put(req); goto discard_it; } @@ -2399,7 +2399,7 @@ discard_it: return 0; discard_and_relse: - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); if (refcounted) sock_put(sk); goto discard_it; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8b2e7b7afbd8..b4e56b877273 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1809,7 +1809,7 @@ lookup: &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); reqsk_put(req); goto discard_it; } @@ -1948,7 +1948,7 @@ discard_it: return 0; discard_and_relse: - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); if (refcounted) sock_put(sk); goto discard_it; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f2e728239480..ad41c48126e4 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -137,7 +137,7 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk) static void mptcp_drop(struct sock *sk, struct sk_buff *skb) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); __kfree_skb(skb); } -- cgit v1.2.3 From 51132b99f01ce05f8008f0fb189d83eed484bd53 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:30 +0000 Subject: udp: add drop_counters to udp socket When a packet flood hits one or more UDP sockets, many cpus have to update sk->sk_drops. This slows down other cpus, because currently sk_drops is in sock_write_rx group. Add a socket_drop_counters structure to udp sockets. Using dedicated cache lines to hold drop counters makes sure that consumers no longer suffer from false sharing if/when producers only change sk->sk_drops. This adds 128 bytes per UDP socket. Tested with the following stress test, sending about 11 Mpps to a dual socket AMD EPYC 7B13 64-Core. super_netperf 20 -t UDP_STREAM -H DUT -l10 -- -n -P,1000 -m 120 Note: due to socket lookup, only one UDP socket is receiving packets on DUT. Then measure receiver (DUT) behavior. We can see both consumer and BH handlers can process more packets per second. Before: nstat -n ; sleep 1 ; nstat | grep Udp Udp6InDatagrams 615091 0.0 Udp6InErrors 3904277 0.0 Udp6RcvbufErrors 3904277 0.0 After: nstat -n ; sleep 1 ; nstat | grep Udp Udp6InDatagrams 816281 0.0 Udp6InErrors 7497093 0.0 Udp6RcvbufErrors 7497093 0.0 Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-5-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/udp.h | 1 + include/net/udp.h | 1 + tools/testing/selftests/bpf/progs/bpf_iter_udp4.c | 3 ++- tools/testing/selftests/bpf/progs/bpf_iter_udp6.c | 4 ++-- 4 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 4e1a672af4c5..981506be1e15 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -108,6 +108,7 @@ struct udp_sock { * the last UDP socket cacheline. 
*/ struct hlist_node tunnel_list; + struct socket_drop_counters drop_counters; }; #define udp_test_bit(nr, sk) \ diff --git a/include/net/udp.h b/include/net/udp.h index 7b26d4c50f33..93b159f30e88 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -288,6 +288,7 @@ static inline void udp_lib_init_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); + sk->sk_drop_counters = &up->drop_counters; skb_queue_head_init(&up->reader_queue); INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c index ffbd4b116d17..23b2aa2604de 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c @@ -64,7 +64,8 @@ int dump_udp4(struct bpf_iter__udp *ctx) 0, 0L, 0, ctx->uid, 0, sock_i_ino(&inet->sk), inet->sk.sk_refcnt.refs.counter, udp_sk, - inet->sk.sk_drops.counter); + udp_sk->drop_counters.drops0.counter + + udp_sk->drop_counters.drops1.counter); return 0; } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c index 47ff7754f4fd..c48b05aa2a4b 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c @@ -72,7 +72,7 @@ int dump_udp6(struct bpf_iter__udp *ctx) 0, 0L, 0, ctx->uid, 0, sock_i_ino(&inet->sk), inet->sk.sk_refcnt.refs.counter, udp_sk, - inet->sk.sk_drops.counter); - + udp_sk->drop_counters.drops0.counter + + udp_sk->drop_counters.drops1.counter); return 0; } -- cgit v1.2.3 From b81aa23234d94d99951761d9864061d774633ba9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Aug 2025 12:50:31 +0000 Subject: inet: raw: add drop_counters to raw sockets When a packet flood hits one or more RAW sockets, many cpus have to update sk->sk_drops. This slows down other cpus, because currently sk_drops is in sock_write_rx group. Add a socket_drop_counters structure to raw sockets. Using dedicated cache lines to hold drop counters makes sure that consumers no longer suffer from false sharing if/when producers only change sk->sk_drops. This adds 128 bytes per RAW socket. 
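(Editorial sketch: this log never shows struct socket_drop_counters itself, but the drops0/drops1 fields read by the BPF selftests above and the stated 128-byte cost suggest a layout along these lines, with each counter on its own cache line so the softirq producer and the reading consumer no longer dirty the same line. Each protocol then embeds one instance and points sk->sk_drop_counters at it from its init hook, as the raw_sk_init()/rawv6_init_sk() hunks below do.)

/* Assumed layout, 2 x 64-byte cache lines on common configs. */
struct socket_drop_counters {
	atomic_t drops0 ____cacheline_aligned_in_smp;
	atomic_t drops1 ____cacheline_aligned_in_smp;
};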
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250826125031.1578842-6-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/ipv6.h | 2 +- include/net/raw.h | 1 + net/ipv4/raw.c | 1 + net/ipv6/raw.c | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index bc6ec2959173..261d02efb615 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -295,7 +295,7 @@ struct raw6_sock { __u32 offset; /* checksum offset */ struct icmp6_filter filter; __u32 ip6mr_table; - + struct socket_drop_counters drop_counters; struct ipv6_pinfo inet6; }; diff --git a/include/net/raw.h b/include/net/raw.h index 32a61481a253..d52709139060 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -81,6 +81,7 @@ struct raw_sock { struct inet_sock inet; struct icmp_filter filter; u32 ipmr_table; + struct socket_drop_counters drop_counters; }; #define raw_sk(ptr) container_of_const(ptr, struct raw_sock, inet.sk) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 0f9f02f6146e..d54ebb7df966 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -793,6 +793,7 @@ static int raw_sk_init(struct sock *sk) { struct raw_sock *rp = raw_sk(sk); + sk->sk_drop_counters = &rp->drop_counters; if (inet_sk(sk)->inet_num == IPPROTO_ICMP) memset(&rp->filter, 0, sizeof(rp->filter)); return 0; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4026192143ec..4ae07a67b4d4 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1175,6 +1175,7 @@ static int rawv6_init_sk(struct sock *sk) { struct raw6_sock *rp = raw6_sk(sk); + sk->sk_drop_counters = &rp->drop_counters; switch (inet_sk(sk)->inet_num) { case IPPROTO_ICMPV6: rp->checksum = 1; -- cgit v1.2.3 From 3ea299d3dccdb8554057d0a87552e7673baea95d Mon Sep 17 00:00:00 2001 From: Gabor Juhos Date: Mon, 11 Aug 2025 09:40:40 +0200 Subject: mtd: nand: qpic-common: remove a bunch of unused defines A bunch of definitions in the 'nand-qpic-common.h' header became unused after the conversion of the 'qcom_nandc' and 'spi-qpic-snand' drivers to use the FIELD_PREP() macro, so remove those. No functional changes. 
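(Illustrative aside: the FIELD_PREP() conversion that orphaned these defines derives the shift from the mask itself, so the separate shift constants became dead. The cfg0/udsize names below are made up; the mask is one this patch keeps.)

#include <linux/bitfield.h>

	cfg0 |= FIELD_PREP(UD_SIZE_BYTES_MASK, udsize);	/* was: udsize << UD_SIZE_BYTES */
	udsize = FIELD_GET(UD_SIZE_BYTES_MASK, cfg0);	/* reading a field back */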
Signed-off-by: Gabor Juhos Signed-off-by: Miquel Raynal --- include/linux/mtd/nand-qpic-common.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand-qpic-common.h b/include/linux/mtd/nand-qpic-common.h index 4e694b1aabbd..e8201d1b7cf9 100644 --- a/include/linux/mtd/nand-qpic-common.h +++ b/include/linux/mtd/nand-qpic-common.h @@ -71,14 +71,10 @@ /* NAND_DEVn_CFG0 bits */ #define DISABLE_STATUS_AFTER_WRITE BIT(4) -#define CW_PER_PAGE 6 #define CW_PER_PAGE_MASK GENMASK(8, 6) -#define UD_SIZE_BYTES 9 #define UD_SIZE_BYTES_MASK GENMASK(18, 9) #define ECC_PARITY_SIZE_BYTES_RS GENMASK(22, 19) -#define SPARE_SIZE_BYTES 23 #define SPARE_SIZE_BYTES_MASK GENMASK(26, 23) -#define NUM_ADDR_CYCLES 27 #define NUM_ADDR_CYCLES_MASK GENMASK(29, 27) #define STATUS_BFR_READ BIT(30) #define SET_RD_MODE_AFTER_STATUS BIT(31) @@ -86,26 +82,20 @@ /* NAND_DEVn_CFG0 bits */ #define DEV0_CFG1_ECC_DISABLE BIT(0) #define WIDE_FLASH BIT(1) -#define NAND_RECOVERY_CYCLES 2 #define NAND_RECOVERY_CYCLES_MASK GENMASK(4, 2) #define CS_ACTIVE_BSY BIT(5) -#define BAD_BLOCK_BYTE_NUM 6 #define BAD_BLOCK_BYTE_NUM_MASK GENMASK(15, 6) #define BAD_BLOCK_IN_SPARE_AREA BIT(16) -#define WR_RD_BSY_GAP 17 #define WR_RD_BSY_GAP_MASK GENMASK(22, 17) #define ENABLE_BCH_ECC BIT(27) /* NAND_DEV0_ECC_CFG bits */ #define ECC_CFG_ECC_DISABLE BIT(0) #define ECC_SW_RESET BIT(1) -#define ECC_MODE 4 #define ECC_MODE_MASK GENMASK(5, 4) #define ECC_MODE_4BIT 0 #define ECC_MODE_8BIT 1 -#define ECC_PARITY_SIZE_BYTES_BCH 8 #define ECC_PARITY_SIZE_BYTES_BCH_MASK GENMASK(12, 8) -#define ECC_NUM_DATA_BYTES 16 #define ECC_NUM_DATA_BYTES_MASK GENMASK(25, 16) #define ECC_FORCE_CLK_OPEN BIT(30) @@ -120,7 +110,6 @@ #define SEQ_READ_START_VLD BIT(4) /* NAND_EBI2_ECC_BUF_CFG bits */ -#define NUM_STEPS 0 #define NUM_STEPS_MASK GENMASK(9, 0) /* NAND_ERASED_CW_DETECT_CFG bits */ @@ -141,11 +130,8 @@ #define ERASED_CW (CODEWORD_ALL_ERASED | CODEWORD_ERASED) /* NAND_READ_LOCATION_n bits */ -#define READ_LOCATION_OFFSET 0 #define READ_LOCATION_OFFSET_MASK GENMASK(9, 0) -#define READ_LOCATION_SIZE 16 #define READ_LOCATION_SIZE_MASK GENMASK(25, 16) -#define READ_LOCATION_LAST 31 #define READ_LOCATION_LAST_MASK BIT(31) /* Version Mask */ -- cgit v1.2.3 From 5f284dc15ca8695d0394414045ac64616a3b0e69 Mon Sep 17 00:00:00 2001 From: Tianling Shen Date: Mon, 25 Aug 2025 01:00:13 +0800 Subject: mtd: spinand: add support for FudanMicro FM25S01A Add support for FudanMicro FM25S01A SPI NAND. 
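(Editorial gloss on the memory organization used in the driver below, based on the standard NAND_MEMORG() parameter order; worth cross-checking against the datasheet linked next.)

/* NAND_MEMORG(1, 2048, 64, 64, 1024, 20, 1, 1, 1) reads as:
 * 1 bit per cell (SLC), 2048-byte pages with 64-byte OOB,
 * 64 pages per eraseblock, 1024 eraseblocks per LUN,
 * at most 20 bad eraseblocks per LUN, 1 plane, 1 LUN, 1 target
 * => 64 * 2048 * 1024 bytes = 128 MiB (1 Gbit) of main storage.
 */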
Datasheet: http://eng.fmsh.com/nvm/FM25S01A_ds_eng.pdf Signed-off-by: Tianling Shen Signed-off-by: Miquel Raynal --- drivers/mtd/nand/spi/Makefile | 2 +- drivers/mtd/nand/spi/core.c | 1 + drivers/mtd/nand/spi/fmsh.c | 74 +++++++++++++++++++++++++++++++++++++++++++ include/linux/mtd/spinand.h | 1 + 4 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 drivers/mtd/nand/spi/fmsh.c (limited to 'include/linux') diff --git a/drivers/mtd/nand/spi/Makefile b/drivers/mtd/nand/spi/Makefile index 258da42451a4..6d3d203df048 100644 --- a/drivers/mtd/nand/spi/Makefile +++ b/drivers/mtd/nand/spi/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 spinand-objs := core.o otp.o -spinand-objs += alliancememory.o ato.o esmt.o foresee.o gigadevice.o macronix.o +spinand-objs += alliancememory.o ato.o esmt.o fmsh.o foresee.o gigadevice.o macronix.o spinand-objs += micron.o paragon.o skyhigh.o toshiba.o winbond.o xtx.o obj-$(CONFIG_MTD_SPI_NAND) += spinand.o diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c index b0898990b2a5..ea47028d021a 100644 --- a/drivers/mtd/nand/spi/core.c +++ b/drivers/mtd/nand/spi/core.c @@ -1184,6 +1184,7 @@ static const struct spinand_manufacturer *spinand_manufacturers[] = { &alliancememory_spinand_manufacturer, &ato_spinand_manufacturer, &esmt_c8_spinand_manufacturer, + &fmsh_spinand_manufacturer, &foresee_spinand_manufacturer, &gigadevice_spinand_manufacturer, ¯onix_spinand_manufacturer, diff --git a/drivers/mtd/nand/spi/fmsh.c b/drivers/mtd/nand/spi/fmsh.c new file mode 100644 index 000000000000..8b2097bfc771 --- /dev/null +++ b/drivers/mtd/nand/spi/fmsh.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020-2021 Rockchip Electronics Co., Ltd. + * + * Author: Dingqiang Lin + */ + +#include +#include +#include + +#define SPINAND_MFR_FMSH 0xA1 + +static SPINAND_OP_VARIANTS(read_cache_variants, + SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0, 0), + SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0, 0), + SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0, 0), + SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0, 0), + SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0, 0), + SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0, 0)); + +static SPINAND_OP_VARIANTS(write_cache_variants, + SPINAND_PROG_LOAD_1S_1S_4S_OP(true, 0, NULL, 0), + SPINAND_PROG_LOAD_1S_1S_1S_OP(true, 0, NULL, 0)); + +static SPINAND_OP_VARIANTS(update_cache_variants, + SPINAND_PROG_LOAD_1S_1S_4S_OP(false, 0, NULL, 0), + SPINAND_PROG_LOAD_1S_1S_1S_OP(false, 0, NULL, 0)); + +static int fm25s01a_ooblayout_ecc(struct mtd_info *mtd, int section, + struct mtd_oob_region *region) +{ + return -ERANGE; +} + +static int fm25s01a_ooblayout_free(struct mtd_info *mtd, int section, + struct mtd_oob_region *region) +{ + if (section) + return -ERANGE; + + region->offset = 2; + region->length = 62; + + return 0; +} + +static const struct mtd_ooblayout_ops fm25s01a_ooblayout = { + .ecc = fm25s01a_ooblayout_ecc, + .free = fm25s01a_ooblayout_free, +}; + +static const struct spinand_info fmsh_spinand_table[] = { + SPINAND_INFO("FM25S01A", + SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0xE4), + NAND_MEMORG(1, 2048, 64, 64, 1024, 20, 1, 1, 1), + NAND_ECCREQ(1, 512), + SPINAND_INFO_OP_VARIANTS(&read_cache_variants, + &write_cache_variants, + &update_cache_variants), + SPINAND_HAS_QE_BIT, + SPINAND_ECCINFO(&fm25s01a_ooblayout, NULL)), +}; + +static const struct spinand_manufacturer_ops fmsh_spinand_manuf_ops = { +}; + +const struct 
spinand_manufacturer fmsh_spinand_manufacturer = { + .id = SPINAND_MFR_FMSH, + .name = "Fudan Micro", + .chips = fmsh_spinand_table, + .nchips = ARRAY_SIZE(fmsh_spinand_table), + .ops = &fmsh_spinand_manuf_ops, +}; diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 27a45bdab7ec..927c10d78769 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -355,6 +355,7 @@ struct spinand_manufacturer { extern const struct spinand_manufacturer alliancememory_spinand_manufacturer; extern const struct spinand_manufacturer ato_spinand_manufacturer; extern const struct spinand_manufacturer esmt_c8_spinand_manufacturer; +extern const struct spinand_manufacturer fmsh_spinand_manufacturer; extern const struct spinand_manufacturer foresee_spinand_manufacturer; extern const struct spinand_manufacturer gigadevice_spinand_manufacturer; extern const struct spinand_manufacturer macronix_spinand_manufacturer; -- cgit v1.2.3 From 30c2b98aa84c76f2ae60e66dd4ec2d9497713359 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 28 Aug 2025 12:33:17 +0530 Subject: x86/apic: Add new driver for Secure AVIC The Secure AVIC feature provides SEV-SNP guests hardware acceleration for performance sensitive APIC accesses while securely managing the guest-owned APIC state through the use of a private APIC backing page. This helps prevent the hypervisor from generating unexpected interrupts for a vCPU or otherwise violating architectural assumptions around the APIC behavior. Add a new x2APIC driver that will serve as the base of the Secure AVIC support. It is initially the same as the x2APIC physical driver (without IPI callbacks), but will be modified as features are implemented. As the new driver does not implement Secure AVIC features yet, if the hypervisor sets the Secure AVIC bit in SEV_STATUS, maintain the existing behavior to enforce guest termination. [ bp: Massage commit message. ] Co-developed-by: Kishon Vijay Abraham I Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Neeraj Upadhyay Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tianyu Lan Link: https://lore.kernel.org/20250828070334.208401-2-Neeraj.Upadhyay@amd.com --- arch/x86/Kconfig | 13 ++++++++ arch/x86/boot/compressed/sev.c | 1 + arch/x86/coco/core.c | 3 ++ arch/x86/coco/sev/core.c | 1 + arch/x86/include/asm/msr-index.h | 4 ++- arch/x86/kernel/apic/Makefile | 1 + arch/x86/kernel/apic/x2apic_savic.c | 63 +++++++++++++++++++++++++++++++++++++ include/linux/cc_platform.h | 8 +++++ 8 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/apic/x2apic_savic.c (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58d890fe2100..e32952728d9a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -483,6 +483,19 @@ config X86_X2APIC If in doubt, say Y. +config AMD_SECURE_AVIC + bool "AMD Secure AVIC" + depends on AMD_MEM_ENCRYPT && X86_X2APIC + help + Enable this to get AMD Secure AVIC support on guests that have this feature. + + AMD Secure AVIC provides hardware acceleration for performance sensitive + APIC accesses and support for managing guest owned APIC state for SEV-SNP + guests. Secure AVIC does not support xAPIC mode. It has functional + dependency on x2apic being enabled in the guest. + + If you don't know what to do here, say N.
+ config X86_POSTED_MSI bool "Enable MSI and MSI-x delivery by posted interrupts" depends on X86_64 && IRQ_REMAP diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index fd1b67dfea22..74e083feb2d9 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -235,6 +235,7 @@ bool sev_es_check_ghcb_fault(unsigned long address) MSR_AMD64_SNP_VMSA_REG_PROT | \ MSR_AMD64_SNP_RESERVED_BIT13 | \ MSR_AMD64_SNP_RESERVED_BIT15 | \ + MSR_AMD64_SNP_SECURE_AVIC | \ MSR_AMD64_SNP_RESERVED_MASK) /* diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index d4610af68114..989ca9f72ba3 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -104,6 +104,9 @@ static bool noinstr amd_cc_platform_has(enum cc_attr attr) case CC_ATTR_HOST_SEV_SNP: return cc_flags.host_sev_snp; + case CC_ATTR_SNP_SECURE_AVIC: + return sev_status & MSR_AMD64_SNP_SECURE_AVIC; + default: return false; } diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 14ef5908fb27..f7a549f650e9 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -79,6 +79,7 @@ static const char * const sev_status_feat_names[] = { [MSR_AMD64_SNP_IBS_VIRT_BIT] = "IBSVirt", [MSR_AMD64_SNP_VMSA_REG_PROT_BIT] = "VMSARegProt", [MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt", + [MSR_AMD64_SNP_SECURE_AVIC_BIT] = "SecureAVIC", }; /* diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b65c3ba5fa14..2a6d4fd8659a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -699,7 +699,9 @@ #define MSR_AMD64_SNP_VMSA_REG_PROT BIT_ULL(MSR_AMD64_SNP_VMSA_REG_PROT_BIT) #define MSR_AMD64_SNP_SMT_PROT_BIT 17 #define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) -#define MSR_AMD64_SNP_RESV_BIT 18 +#define MSR_AMD64_SNP_SECURE_AVIC_BIT 18 +#define MSR_AMD64_SNP_SECURE_AVIC BIT_ULL(MSR_AMD64_SNP_SECURE_AVIC_BIT) +#define MSR_AMD64_SNP_RESV_BIT 19 #define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) #define MSR_AMD64_RMP_BASE 0xc0010132 #define MSR_AMD64_RMP_END 0xc0010133 diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 52d1808ee360..581db89477f9 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -18,6 +18,7 @@ ifeq ($(CONFIG_X86_64),y) # APIC probe will depend on the listing order here obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o obj-$(CONFIG_X86_UV) += x2apic_uv_x.o +obj-$(CONFIG_AMD_SECURE_AVIC) += x2apic_savic.o obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o obj-y += apic_flat_64.o diff --git a/arch/x86/kernel/apic/x2apic_savic.c b/arch/x86/kernel/apic/x2apic_savic.c new file mode 100644 index 000000000000..bea844f28192 --- /dev/null +++ b/arch/x86/kernel/apic/x2apic_savic.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure AVIC Support (SEV-SNP Guests) + * + * Copyright (C) 2024 Advanced Micro Devices, Inc. 
+ * + * Author: Neeraj Upadhyay + */ + +#include + +#include +#include + +#include "local.h" + +static int savic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return x2apic_enabled() && cc_platform_has(CC_ATTR_SNP_SECURE_AVIC); +} + +static int savic_probe(void) +{ + if (!cc_platform_has(CC_ATTR_SNP_SECURE_AVIC)) + return 0; + + if (!x2apic_mode) { + pr_err("Secure AVIC enabled in non x2APIC mode\n"); + snp_abort(); + /* unreachable */ + } + + return 1; +} + +static struct apic apic_x2apic_savic __ro_after_init = { + + .name = "secure avic x2apic", + .probe = savic_probe, + .acpi_madt_oem_check = savic_acpi_madt_oem_check, + + .dest_mode_logical = false, + + .disable_esr = 0, + + .cpu_present_to_apicid = default_cpu_present_to_apicid, + + .max_apic_id = UINT_MAX, + .x2apic_set_max_apicid = true, + .get_apic_id = x2apic_get_apic_id, + + .calc_dest_apicid = apic_default_calc_apicid, + + .nmi_to_offline_cpu = true, + + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .eoi = native_apic_msr_eoi, + .icr_read = native_x2apic_icr_read, + .icr_write = native_x2apic_icr_write, +}; + +apic_driver(apic_x2apic_savic); diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index 0bf7d33a1048..7fcec025c5e0 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -96,6 +96,14 @@ enum cc_attr { * enabled to run SEV-SNP guests. */ CC_ATTR_HOST_SEV_SNP, + + /** + * @CC_ATTR_SNP_SECURE_AVIC: Secure AVIC mode is active. + * + * The host kernel is running with the necessary features enabled + * to run SEV-SNP guests with full Secure AVIC capabilities. + */ + CC_ATTR_SNP_SECURE_AVIC, }; #ifdef CONFIG_ARCH_HAS_CC_PLATFORM -- cgit v1.2.3 From 9a98f9e84cfbeaa51af42ba2b8bbbde046c709a7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Aug 2025 11:39:01 -0400 Subject: fs: make the i_state flags an enum Adjusting i_state flags always means updating the values manually. Bring these forward into the 2020's and make a nice clean macro for defining the i_state values as an enum, providing __ variants for the cases where we need the bit position instead of the actual value, and leaving the actual NAME as the 1U << bit value. Reviewed-by: Christian Brauner Signed-off-by: Josef Bacik Link: https://lore.kernel.org/0da9348da6ece0dce12fccec07b1dd2b8e4cfdab.1756222464.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 231 +++++++++++++++++++++++++++-------------------------- 1 file changed, 119 insertions(+), 112 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 12ecc6b0e6f9..c34554d8c4fe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -664,6 +664,124 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_MGTIME 0x0020 #define IOP_CACHED_LINK 0x0040 +/* + * Inode state bits. Protected by inode->i_lock + * + * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, + * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. + * + * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, + * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at + * various stages of removing an inode. + * + * Two bits are used for locking and completion notification, I_NEW and I_SYNC. + * + * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on + * fdatasync() (unless I_DIRTY_DATASYNC is also set). + * Timestamp updates are the usual cause. + * I_DIRTY_DATASYNC Data-related inode changes pending. 
We keep track of + * these changes separately from I_DIRTY_SYNC so that we + * don't have to write inode on fdatasync() when only + * e.g. the timestamps have changed. + * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. + * I_DIRTY_TIME The inode itself has dirty timestamps, and the + * lazytime mount option is enabled. We keep track of this + * separately from I_DIRTY_SYNC in order to implement + * lazytime. This gets cleared if I_DIRTY_INODE + * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But + * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already + * in place because writeback might already be in progress + * and we don't want to lose the time update + * I_NEW Serves as both a mutex and completion notification. + * New inodes set I_NEW. If two processes both create + * the same inode, one of them will release its inode and + * wait for I_NEW to be released before returning. + * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can + * also cause waiting on I_NEW, without I_NEW actually + * being set. find_inode() uses this to prevent returning + * nearly-dead inodes. + * I_WILL_FREE Must be set when calling write_inode_now() if i_count + * is zero. I_FREEING must be set when I_WILL_FREE is + * cleared. + * I_FREEING Set when inode is about to be freed but still has dirty + * pages or buffers attached or the inode itself is still + * dirty. + * I_CLEAR Added by clear_inode(). In this state the inode is + * clean and can be destroyed. Inode keeps I_FREEING. + * + * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are + * prohibited for many purposes. iget() must wait for + * the inode to be completely released, then create it + * anew. Other functions will just ignore such inodes, + * if appropriate. I_NEW is used for waiting. + * + * I_SYNC Writeback of inode is running. The bit is set during + * data writeback, and cleared with a wakeup on the bit + * address once it is done. The bit is also used to pin + * the inode in memory for flusher thread. + * + * I_REFERENCED Marks the inode as recently references on the LRU list. + * + * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to + * synchronize competing switching instances and to tell + * wb stat updates to grab the i_pages lock. See + * inode_switch_wbs_work_fn() for details. + * + * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper + * and work dirs among overlayfs mounts. + * + * I_CREATING New object's inode in the middle of setting up. + * + * I_DONTCACHE Evict inode as soon as it is not used anymore. + * + * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. + * Used to detect that mark_inode_dirty() should not move + * inode between dirty lists. + * + * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. + * + * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding + * i_count. + * + * Q: What is the difference between I_WILL_FREE and I_FREEING? + * + * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait + * upon. There's one free address left. 
+ */ + +enum inode_state_bits { + __I_NEW = 0U, + __I_SYNC = 1U, + __I_LRU_ISOLATING = 2U + /* reserved wait address bit 3 */ +}; + +enum inode_state_flags_t { + I_NEW = (1U << __I_NEW), + I_SYNC = (1U << __I_SYNC), + I_LRU_ISOLATING = (1U << __I_LRU_ISOLATING), + /* reserved flag bit 3 */ + I_DIRTY_SYNC = (1U << 4), + I_DIRTY_DATASYNC = (1U << 5), + I_DIRTY_PAGES = (1U << 6), + I_WILL_FREE = (1U << 7), + I_FREEING = (1U << 8), + I_CLEAR = (1U << 9), + I_REFERENCED = (1U << 10), + I_LINKABLE = (1U << 11), + I_DIRTY_TIME = (1U << 12), + I_WB_SWITCH = (1U << 13), + I_OVL_INUSE = (1U << 14), + I_CREATING = (1U << 15), + I_DONTCACHE = (1U << 16), + I_SYNC_QUEUED = (1U << 17), + I_PINNING_NETFS_WB = (1U << 18) +}; + +#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) +#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) +#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) + /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning @@ -722,7 +840,7 @@ struct inode { #endif /* Misc */ - u32 i_state; + enum inode_state_flags_t i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; @@ -2482,117 +2600,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, }; } -/* - * Inode state bits. Protected by inode->i_lock - * - * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, - * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. - * - * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, - * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at - * various stages of removing an inode. - * - * Two bits are used for locking and completion notification, I_NEW and I_SYNC. - * - * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on - * fdatasync() (unless I_DIRTY_DATASYNC is also set). - * Timestamp updates are the usual cause. - * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of - * these changes separately from I_DIRTY_SYNC so that we - * don't have to write inode on fdatasync() when only - * e.g. the timestamps have changed. - * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. - * I_DIRTY_TIME The inode itself has dirty timestamps, and the - * lazytime mount option is enabled. We keep track of this - * separately from I_DIRTY_SYNC in order to implement - * lazytime. This gets cleared if I_DIRTY_INODE - * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But - * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already - * in place because writeback might already be in progress - * and we don't want to lose the time update - * I_NEW Serves as both a mutex and completion notification. - * New inodes set I_NEW. If two processes both create - * the same inode, one of them will release its inode and - * wait for I_NEW to be released before returning. - * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can - * also cause waiting on I_NEW, without I_NEW actually - * being set. find_inode() uses this to prevent returning - * nearly-dead inodes. - * I_WILL_FREE Must be set when calling write_inode_now() if i_count - * is zero. I_FREEING must be set when I_WILL_FREE is - * cleared. - * I_FREEING Set when inode is about to be freed but still has dirty - * pages or buffers attached or the inode itself is still - * dirty. - * I_CLEAR Added by clear_inode(). In this state the inode is - * clean and can be destroyed. Inode keeps I_FREEING. - * - * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are - * prohibited for many purposes. 
iget() must wait for - * the inode to be completely released, then create it - * anew. Other functions will just ignore such inodes, - * if appropriate. I_NEW is used for waiting. - * - * I_SYNC Writeback of inode is running. The bit is set during - * data writeback, and cleared with a wakeup on the bit - * address once it is done. The bit is also used to pin - * the inode in memory for flusher thread. - * - * I_REFERENCED Marks the inode as recently references on the LRU list. - * - * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to - * synchronize competing switching instances and to tell - * wb stat updates to grab the i_pages lock. See - * inode_switch_wbs_work_fn() for details. - * - * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper - * and work dirs among overlayfs mounts. - * - * I_CREATING New object's inode in the middle of setting up. - * - * I_DONTCACHE Evict inode as soon as it is not used anymore. - * - * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. - * Used to detect that mark_inode_dirty() should not move - * inode between dirty lists. - * - * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. - * - * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding - * i_count. - * - * Q: What is the difference between I_WILL_FREE and I_FREEING? - * - * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait - * upon. There's one free address left. - */ -#define __I_NEW 0 -#define I_NEW (1 << __I_NEW) -#define __I_SYNC 1 -#define I_SYNC (1 << __I_SYNC) -#define __I_LRU_ISOLATING 2 -#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) - -#define I_DIRTY_SYNC (1 << 3) -#define I_DIRTY_DATASYNC (1 << 4) -#define I_DIRTY_PAGES (1 << 5) -#define I_WILL_FREE (1 << 6) -#define I_FREEING (1 << 7) -#define I_CLEAR (1 << 8) -#define I_REFERENCED (1 << 9) -#define I_LINKABLE (1 << 10) -#define I_DIRTY_TIME (1 << 11) -#define I_WB_SWITCH (1 << 12) -#define I_OVL_INUSE (1 << 13) -#define I_CREATING (1 << 14) -#define I_DONTCACHE (1 << 15) -#define I_SYNC_QUEUED (1 << 16) -#define I_PINNING_NETFS_WB (1 << 17) - -#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) -#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) -#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) - extern void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) { -- cgit v1.2.3 From db2ab24a341ce89351a1bede37a96a3e3ce1726a Mon Sep 17 00:00:00 2001 From: Lauri Vasama Date: Wed, 27 Aug 2025 16:39:00 +0300 Subject: Add RWF_NOSIGNAL flag for pwritev2 For a user mode library to avoid generating SIGPIPE signals (e.g. because this behaviour is not portable across operating systems) is cumbersome. It is generally bad form to change the process-wide signal mask in a library, so a local solution is needed instead. For I/O performed directly using system calls (synchronous or readiness based asynchronous) this currently involves applying a thread-specific signal mask before the operation and reverting it afterwards. This can be avoided when it is known that the file descriptor refers to neither a pipe nor a socket, but a conservative implementation must always apply the mask. This incurs the cost of two additional system calls. In the case of sockets, the existing MSG_NOSIGNAL flag can be used with send. For asynchronous I/O performed using io_uring, currently the only option (apart from MSG_NOSIGNAL for sockets), is to mask SIGPIPE entirely in the call to io_uring_enter. 
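As a concrete illustration of that last workaround, here is a minimal userspace sketch using the raw syscall (signature per io_uring_enter(2); ring setup and error handling omitted, and ring_fd is assumed to be an already initialized ring):

    #include <signal.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/io_uring.h>

    /* Wait for completions with SIGPIPE blocked for the duration of the
     * call; the kernel swaps the signal mask in and out around the wait,
     * like an atomic pthread_sigmask() plus wait. */
    static long wait_cqe_nosigpipe(int ring_fd, unsigned int to_submit)
    {
            sigset_t mask;

            sigprocmask(SIG_SETMASK, NULL, &mask);  /* fetch current mask */
            sigaddset(&mask, SIGPIPE);              /* additionally block SIGPIPE */

            return syscall(__NR_io_uring_enter, ring_fd, to_submit,
                           1 /* min_complete */, IORING_ENTER_GETEVENTS,
                           &mask, _NSIG / 8);
    }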
Thankfully io_uring_enter takes a signal mask, so only a single syscall is needed. However, copying the signal mask on every call incurs a non-zero performance penalty. Furthermore, this mask applies to all completions, meaning that if the non-signaling behaviour is desired only for some subset of operations, the desired signals must be raised manually from user-mode depending on the completed operation. Add RWF_NOSIGNAL flag for pwritev2. This flag prevents the SIGPIPE signal from being raised when writing on disconnected pipes or sockets. The flag is handled directly by the pipe filesystem and converted to the existing MSG_NOSIGNAL flag for sockets. Signed-off-by: Lauri Vasama Link: https://lore.kernel.org/20250827133901.1820771-1-git@vasama.org Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/pipe.c | 6 ++++-- include/linux/fs.h | 1 + include/uapi/linux/fs.h | 5 ++++- net/socket.c | 3 +++ 4 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/pipe.c b/fs/pipe.c index 731622d0738d..42fead1efe52 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -458,7 +458,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) mutex_lock(&pipe->mutex); if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } @@ -498,7 +499,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) for (;;) { if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; diff --git a/include/linux/fs.h b/include/linux/fs.h index 780e9c774c54..34693cae15a2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -356,6 +356,7 @@ struct readahead_control; #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC #define IOCB_DONTCACHE (__force int) RWF_DONTCACHE +#define IOCB_NOSIGNAL (__force int) RWF_NOSIGNAL /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 0bd678a4a10e..beb4c2d1e41c 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -430,10 +430,13 @@ typedef int __bitwise __kernel_rwf_t; /* buffered IO that drops the cache after reading or writing data */ #define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) +/* prevent pipe and socket writes from raising SIGPIPE */ +#define RWF_NOSIGNAL ((__force __kernel_rwf_t)0x00000100) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\ - RWF_DONTCACHE) + RWF_DONTCACHE | RWF_NOSIGNAL) #define PROCFS_IOCTL_MAGIC 'f' diff --git a/net/socket.c b/net/socket.c index 682969deaed3..bac335ecee4c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1176,6 +1176,9 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; + if (iocb->ki_flags & IOCB_NOSIGNAL) + msg.msg_flags |= MSG_NOSIGNAL; + res = __sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; -- cgit v1.2.3 From 72cdc67e7fa74931b055df3a76852bab551f1a04 Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Thu, 28 Aug 2025 09:20:16 +0800 Subject: pppoe: remove rwlock usage Like ppp_generic.c, convert the PPPoE socket hash table to use RCU for lookups and a spinlock for updates. 
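Condensed from the get_item()/__get_item() changes in the diff below, the reader-side lookup reduces to roughly this fragment (identifiers as in the driver; callers hold rcu_read_lock() and hash comes from hash_item()):

    /* Walk the RCU-protected chain without taking any lock, then only
     * keep the socket if its refcount has not already dropped to zero. */
    po = rcu_dereference(pn->hash_table[hash]);
    while (po && !(cmp_addr(&po->pppoe_pa, sid, addr) &&
                   po->pppoe_ifindex == ifindex))
            po = rcu_dereference(po->next);

    if (po && !refcount_inc_not_zero(&sk_pppox(po)->sk_refcnt))
            po = NULL;      /* socket is being freed; treat as not found */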
This removes rwlock usage and allows lockless readers on the fast path. - Mark hash table and list pointers as __rcu. - Use spin_lock() to protect writers. - Readers use rcu_dereference() under rcu_read_lock(). All known callers of get_item() already hold the RCU read lock, so no additional locking is needed. - get_item() now uses refcount_inc_not_zero() instead of sock_hold() to safely take a reference. This prevents crashes if a socket is already in the process of being freed (sk_refcnt == 0). - Set SOCK_RCU_FREE to defer socket freeing until after an RCU grace period. - Move skb_queue_purge() into sk_destruct callback to ensure purge happens after an RCU grace period. Signed-off-by: Qingfang Deng Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250828012018.15922-1-dqfext@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ppp/pppoe.c | 94 +++++++++++++++++++++++++++--------------------- include/linux/if_pppox.h | 2 +- 2 files changed, 54 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 410effa42ade..54522b26b728 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -100,8 +100,8 @@ struct pppoe_net { * as well, moreover in case of SMP less locking * controversy here */ - struct pppox_sock *hash_table[PPPOE_HASH_SIZE]; - rwlock_t hash_lock; + struct pppox_sock __rcu *hash_table[PPPOE_HASH_SIZE]; + spinlock_t hash_lock; }; /* @@ -162,13 +162,13 @@ static struct pppox_sock *__get_item(struct pppoe_net *pn, __be16 sid, int hash = hash_item(sid, addr); struct pppox_sock *ret; - ret = pn->hash_table[hash]; + ret = rcu_dereference(pn->hash_table[hash]); while (ret) { if (cmp_addr(&ret->pppoe_pa, sid, addr) && ret->pppoe_ifindex == ifindex) return ret; - ret = ret->next; + ret = rcu_dereference(ret->next); } return NULL; @@ -177,19 +177,20 @@ static struct pppox_sock *__get_item(struct pppoe_net *pn, __be16 sid, static int __set_item(struct pppoe_net *pn, struct pppox_sock *po) { int hash = hash_item(po->pppoe_pa.sid, po->pppoe_pa.remote); - struct pppox_sock *ret; + struct pppox_sock *ret, *first; - ret = pn->hash_table[hash]; + first = rcu_dereference_protected(pn->hash_table[hash], lockdep_is_held(&pn->hash_lock)); + ret = first; while (ret) { if (cmp_2_addr(&ret->pppoe_pa, &po->pppoe_pa) && ret->pppoe_ifindex == po->pppoe_ifindex) return -EALREADY; - ret = ret->next; + ret = rcu_dereference_protected(ret->next, lockdep_is_held(&pn->hash_lock)); } - po->next = pn->hash_table[hash]; - pn->hash_table[hash] = po; + RCU_INIT_POINTER(po->next, first); + rcu_assign_pointer(pn->hash_table[hash], po); return 0; } @@ -198,20 +199,24 @@ static void __delete_item(struct pppoe_net *pn, __be16 sid, char *addr, int ifindex) { int hash = hash_item(sid, addr); - struct pppox_sock *ret, **src; + struct pppox_sock *ret, __rcu **src; - ret = pn->hash_table[hash]; + ret = rcu_dereference_protected(pn->hash_table[hash], lockdep_is_held(&pn->hash_lock)); src = &pn->hash_table[hash]; while (ret) { if (cmp_addr(&ret->pppoe_pa, sid, addr) && ret->pppoe_ifindex == ifindex) { - *src = ret->next; + struct pppox_sock *next; + + next = rcu_dereference_protected(ret->next, + lockdep_is_held(&pn->hash_lock)); + rcu_assign_pointer(*src, next); break; } src = &ret->next; - ret = ret->next; + ret = rcu_dereference_protected(ret->next, lockdep_is_held(&pn->hash_lock)); } } @@ -225,11 +230,9 @@ static inline struct pppox_sock *get_item(struct pppoe_net *pn, __be16 sid, { struct pppox_sock *po; - 
read_lock_bh(&pn->hash_lock); po = __get_item(pn, sid, addr, ifindex); - if (po) - sock_hold(sk_pppox(po)); - read_unlock_bh(&pn->hash_lock); + if (po && !refcount_inc_not_zero(&sk_pppox(po)->sk_refcnt)) + po = NULL; return po; } @@ -258,9 +261,9 @@ static inline struct pppox_sock *get_item_by_addr(struct net *net, static inline void delete_item(struct pppoe_net *pn, __be16 sid, char *addr, int ifindex) { - write_lock_bh(&pn->hash_lock); + spin_lock(&pn->hash_lock); __delete_item(pn, sid, addr, ifindex); - write_unlock_bh(&pn->hash_lock); + spin_unlock(&pn->hash_lock); } /*************************************************************************** @@ -276,14 +279,16 @@ static void pppoe_flush_dev(struct net_device *dev) int i; pn = pppoe_pernet(dev_net(dev)); - write_lock_bh(&pn->hash_lock); + spin_lock(&pn->hash_lock); for (i = 0; i < PPPOE_HASH_SIZE; i++) { - struct pppox_sock *po = pn->hash_table[i]; + struct pppox_sock *po = rcu_dereference_protected(pn->hash_table[i], + lockdep_is_held(&pn->hash_lock)); struct sock *sk; while (po) { while (po && po->pppoe_dev != dev) { - po = po->next; + po = rcu_dereference_protected(po->next, + lockdep_is_held(&pn->hash_lock)); } if (!po) @@ -300,7 +305,7 @@ static void pppoe_flush_dev(struct net_device *dev) */ sock_hold(sk); - write_unlock_bh(&pn->hash_lock); + spin_unlock(&pn->hash_lock); lock_sock(sk); if (po->pppoe_dev == dev && @@ -320,11 +325,12 @@ static void pppoe_flush_dev(struct net_device *dev) */ BUG_ON(pppoe_pernet(dev_net(dev)) == NULL); - write_lock_bh(&pn->hash_lock); - po = pn->hash_table[i]; + spin_lock(&pn->hash_lock); + po = rcu_dereference_protected(pn->hash_table[i], + lockdep_is_held(&pn->hash_lock)); } } - write_unlock_bh(&pn->hash_lock); + spin_unlock(&pn->hash_lock); } static int pppoe_device_event(struct notifier_block *this, @@ -528,6 +534,11 @@ static struct proto pppoe_sk_proto __read_mostly = { .obj_size = sizeof(struct pppox_sock), }; +static void pppoe_destruct(struct sock *sk) +{ + skb_queue_purge(&sk->sk_receive_queue); +} + /*********************************************************************** * * Initialize a new struct sock. 
@@ -542,11 +553,13 @@ static int pppoe_create(struct net *net, struct socket *sock, int kern) return -ENOMEM; sock_init_data(sock, sk); + sock_set_flag(sk, SOCK_RCU_FREE); sock->state = SS_UNCONNECTED; sock->ops = &pppoe_ops; sk->sk_backlog_rcv = pppoe_rcv_core; + sk->sk_destruct = pppoe_destruct; sk->sk_state = PPPOX_NONE; sk->sk_type = SOCK_STREAM; sk->sk_family = PF_PPPOX; @@ -599,7 +612,6 @@ static int pppoe_release(struct socket *sock) sock_orphan(sk); sock->sk = NULL; - skb_queue_purge(&sk->sk_receive_queue); release_sock(sk); sock_put(sk); @@ -681,9 +693,9 @@ static int pppoe_connect(struct socket *sock, struct sockaddr *uservaddr, &sp->sa_addr.pppoe, sizeof(struct pppoe_addr)); - write_lock_bh(&pn->hash_lock); + spin_lock(&pn->hash_lock); error = __set_item(pn, po); - write_unlock_bh(&pn->hash_lock); + spin_unlock(&pn->hash_lock); if (error < 0) goto err_put; @@ -1052,11 +1064,11 @@ static inline struct pppox_sock *pppoe_get_idx(struct pppoe_net *pn, loff_t pos) int i; for (i = 0; i < PPPOE_HASH_SIZE; i++) { - po = pn->hash_table[i]; + po = rcu_dereference(pn->hash_table[i]); while (po) { if (!pos--) goto out; - po = po->next; + po = rcu_dereference(po->next); } } @@ -1065,19 +1077,19 @@ out: } static void *pppoe_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(pn->hash_lock) + __acquires(RCU) { struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); loff_t l = *pos; - read_lock_bh(&pn->hash_lock); + rcu_read_lock(); return l ? pppoe_get_idx(pn, --l) : SEQ_START_TOKEN; } static void *pppoe_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); - struct pppox_sock *po; + struct pppox_sock *po, *next; ++*pos; if (v == SEQ_START_TOKEN) { @@ -1085,14 +1097,15 @@ static void *pppoe_seq_next(struct seq_file *seq, void *v, loff_t *pos) goto out; } po = v; - if (po->next) - po = po->next; + next = rcu_dereference(po->next); + if (next) + po = next; else { int hash = hash_item(po->pppoe_pa.sid, po->pppoe_pa.remote); po = NULL; while (++hash < PPPOE_HASH_SIZE) { - po = pn->hash_table[hash]; + po = rcu_dereference(pn->hash_table[hash]); if (po) break; } @@ -1103,10 +1116,9 @@ out: } static void pppoe_seq_stop(struct seq_file *seq, void *v) - __releases(pn->hash_lock) + __releases(RCU) { - struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); - read_unlock_bh(&pn->hash_lock); + rcu_read_unlock(); } static const struct seq_operations pppoe_seq_ops = { @@ -1149,7 +1161,7 @@ static __net_init int pppoe_init_net(struct net *net) struct pppoe_net *pn = pppoe_pernet(net); struct proc_dir_entry *pde; - rwlock_init(&pn->hash_lock); + spin_lock_init(&pn->hash_lock); pde = proc_create_net("pppoe", 0444, net->proc_net, &pppoe_seq_ops, sizeof(struct seq_net_private)); diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index ff3beda1312c..db45d6f1c4f4 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -43,7 +43,7 @@ struct pppox_sock { /* struct sock must be the first member of pppox_sock */ struct sock sk; struct ppp_channel chan; - struct pppox_sock *next; /* for hash table */ + struct pppox_sock __rcu *next; /* for hash table */ union { struct pppoe_opt pppoe; struct pptp_opt pptp; -- cgit v1.2.3 From 9529320ad64e614cfaf96e6b8e3d8c0a1245160c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Aug 2025 10:27:37 +0000 Subject: inet_diag: change inet_diag_bc_sk() first argument We want to have access to the inet_diag_dump_data structure in the following patch. 
This patch removes duplication in callers. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250828102738.2065992-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/inet_diag.h | 2 +- net/ipv4/inet_diag.c | 3 ++- net/ipv4/raw_diag.c | 10 +++------- net/ipv4/tcp_diag.c | 8 +++----- net/ipv4/udp_diag.c | 10 +++------- net/mptcp/mptcp_diag.c | 15 ++++----------- 6 files changed, 16 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 30bf8f7ea62b..86a0641ec36e 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -46,7 +46,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, const struct inet_diag_req_v2 *req, u16 nlmsg_flags, bool net_admin); -int inet_diag_bc_sk(const struct nlattr *_bc, struct sock *sk); +int inet_diag_bc_sk(const struct inet_diag_dump_data *cb_data, struct sock *sk); void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 3827e9979d4f..117103042687 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -591,8 +591,9 @@ static void entry_fill_addrs(struct inet_diag_entry *entry, } } -int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) +int inet_diag_bc_sk(const struct inet_diag_dump_data *cb_data, struct sock *sk) { + const struct nlattr *bc = cb_data->inet_diag_nla_bc; const struct inet_sock *inet = inet_sk(sk); struct inet_diag_entry entry; diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index cc793bd8de25..943e5998e0ad 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -126,9 +126,9 @@ static int raw_diag_dump_one(struct netlink_callback *cb, static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, - struct nlattr *bc, bool net_admin) + bool net_admin) { - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb->data, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin); @@ -140,17 +140,13 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct net *net = sock_net(skb->sk); - struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; struct hlist_head *hlist; struct sock *sk = NULL; - struct nlattr *bc; if (IS_ERR(hashinfo)) return; - cb_data = cb->data; - bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -174,7 +170,7 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; - if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) + if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0) goto out_unlock; next: num++; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 4ed6b93527f4..d83efd91f461 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -320,11 +320,9 @@ static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, u32 idiag_states = r->idiag_states; struct inet_hashinfo *hashinfo; int i, num, s_i, s_num; - struct nlattr *bc; struct sock *sk; hashinfo = net->ipv4.tcp_death_row.hashinfo; - bc = cb_data->inet_diag_nla_bc; if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; s_i = cb->args[1]; @@ -365,7 +363,7 @@ static void tcp_diag_dump(struct 
sk_buff *skb, struct netlink_callback *cb, r->id.idiag_sport) goto next_listen; - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb_data, sk)) goto next_listen; if (inet_sk_diag_fill(sk, inet_csk(sk), skb, @@ -432,7 +430,7 @@ resume_bind_walk: r->sdiag_family != sk->sk_family) goto next_bind; - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb_data, sk)) goto next_bind; sock_hold(sk); @@ -519,7 +517,7 @@ next_chunk: goto next_normal; twsk_build_assert(); - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb_data, sk)) goto next_normal; if (!refcount_inc_not_zero(&sk->sk_refcnt)) diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 38cb3a28e4ed..6e491c720c90 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -16,9 +16,9 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, - struct nlattr *bc, bool net_admin) + bool net_admin) { - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb->data, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI, @@ -92,12 +92,8 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); - struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; - struct nlattr *bc; - cb_data = cb->data; - bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -130,7 +126,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, r->id.idiag_dport) goto next; - if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { + if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index 0566dd793810..ac974299de71 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -15,9 +15,9 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, - struct nlattr *bc, bool net_admin) + bool net_admin) { - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb->data, sk)) return 0; return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI, @@ -76,9 +76,7 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba const struct inet_diag_req_v2 *r, bool net_admin) { - struct inet_diag_dump_data *cb_data = cb->data; struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx; - struct nlattr *bc = cb_data->inet_diag_nla_bc; struct net *net = sock_net(skb->sk); struct inet_hashinfo *hinfo; int i; @@ -121,7 +119,7 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba if (!refcount_inc_not_zero(&sk->sk_refcnt)) goto next_listen; - ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); + ret = sk_diag_dump(sk, skb, cb, r, net_admin); sock_put(sk); @@ -154,15 +152,10 @@ static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); - struct inet_diag_dump_data *cb_data; struct mptcp_sock *msk; - struct nlattr *bc; BUILD_BUG_ON(sizeof(cb->ctx) < sizeof(*diag_ctx)); - cb_data = cb->data; - bc = cb_data->inet_diag_nla_bc; - while ((msk = mptcp_token_iter_next(net, &diag_ctx->s_slot, &diag_ctx->s_num)) != NULL) { struct inet_sock *inet = (struct inet_sock *)msk; @@ -181,7 +174,7 @@ static void mptcp_diag_dump(struct sk_buff 
*skb, struct netlink_callback *cb, r->id.idiag_dport) goto next; - ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); + ret = sk_diag_dump(sk, skb, cb, r, net_admin); next: sock_put(sk); if (ret < 0) { -- cgit v1.2.3 From 95fa78830e5b2eb2041174c7f9549c746e003dd6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 28 Aug 2025 10:27:37 +0000 Subject: inet_diag: avoid cache line misses in inet_diag_bc_sk() inet_diag_bc_sk() pulls five cache lines per socket, while most filters only need the first two. Add three booleans to struct inet_diag_dump_data that are selectively set if a filter needs specific socket fields. - mark_needed /* INET_DIAG_BC_MARK_COND present. */ - cgroup_needed /* INET_DIAG_BC_CGROUP_COND present. */ - userlocks_needed /* INET_DIAG_BC_AUTO present. */ This removes millions of cache line misses per ss invocation when simple filters are specified on busy servers. offsetof(struct sock, sk_userlocks) = 0xf3 offsetof(struct sock, sk_mark) = 0x20c offsetof(struct sock, sk_cgrp_data) = 0x298 Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250828102738.2065992-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/inet_diag.h | 5 +++++ net/ipv4/inet_diag.c | 52 ++++++++++++++++++++++++++++------------------- 2 files changed, 36 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 86a0641ec36e..704fd415c2b4 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -38,6 +38,11 @@ struct inet_diag_dump_data { #define inet_diag_nla_bpf_stgs req_nlas[INET_DIAG_REQ_SK_BPF_STORAGES] struct bpf_sk_storage_diag *bpf_stg_diag; + bool mark_needed; /* INET_DIAG_BC_MARK_COND present. */ +#ifdef CONFIG_SOCK_CGROUP_DATA + bool cgroup_needed; /* INET_DIAG_BC_CGROUP_COND present. */ +#endif + bool userlocks_needed; /* INET_DIAG_BC_AUTO present. */ }; struct inet_connection_sock; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 117103042687..f0b6c5a411a2 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -605,18 +605,22 @@ int inet_diag_bc_sk(const struct inet_diag_dump_data *cb_data, struct sock *sk) entry.sport = READ_ONCE(inet->inet_num); entry.dport = ntohs(READ_ONCE(inet->inet_dport)); entry.ifindex = READ_ONCE(sk->sk_bound_dev_if); - entry.userlocks = sk_fullsock(sk) ? READ_ONCE(sk->sk_userlocks) : 0; - if (sk_fullsock(sk)) - entry.mark = READ_ONCE(sk->sk_mark); - else if (sk->sk_state == TCP_NEW_SYN_RECV) - entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; - else if (sk->sk_state == TCP_TIME_WAIT) - entry.mark = inet_twsk(sk)->tw_mark; - else - entry.mark = 0; + if (cb_data->userlocks_needed) + entry.userlocks = sk_fullsock(sk) ? READ_ONCE(sk->sk_userlocks) : 0; + if (cb_data->mark_needed) { + if (sk_fullsock(sk)) + entry.mark = READ_ONCE(sk->sk_mark); + else if (sk->sk_state == TCP_NEW_SYN_RECV) + entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; + else if (sk->sk_state == TCP_TIME_WAIT) + entry.mark = inet_twsk(sk)->tw_mark; + else + entry.mark = 0; + } #ifdef CONFIG_SOCK_CGROUP_DATA + if (cb_data->cgroup_needed) + entry.cgroup_id = sk_fullsock(sk) ?
+ cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0; #endif return inet_diag_bc_run(bc, &entry); @@ -716,16 +720,21 @@ static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len, } #endif -static int inet_diag_bc_audit(const struct nlattr *attr, +static int inet_diag_bc_audit(struct inet_diag_dump_data *cb_data, const struct sk_buff *skb) { - bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN); + const struct nlattr *attr = cb_data->inet_diag_nla_bc; const void *bytecode, *bc; int bytecode_len, len; + bool net_admin; + + if (!attr) + return 0; - if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op)) + if (nla_len(attr) < sizeof(struct inet_diag_bc_op)) return -EINVAL; + net_admin = netlink_net_capable(skb, CAP_NET_ADMIN); bytecode = bc = nla_data(attr); len = bytecode_len = nla_len(attr); @@ -757,14 +766,18 @@ static int inet_diag_bc_audit(const struct nlattr *attr, return -EPERM; if (!valid_markcond(bc, len, &min_len)) return -EINVAL; + cb_data->mark_needed = true; break; #ifdef CONFIG_SOCK_CGROUP_DATA case INET_DIAG_BC_CGROUP_COND: if (!valid_cgroupcond(bc, len, &min_len)) return -EINVAL; + cb_data->cgroup_needed = true; break; #endif case INET_DIAG_BC_AUTO: + cb_data->userlocks_needed = true; + fallthrough; case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: break; @@ -841,13 +854,10 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) kfree(cb_data); return err; } - nla = cb_data->inet_diag_nla_bc; - if (nla) { - err = inet_diag_bc_audit(nla, skb); - if (err) { - kfree(cb_data); - return err; - } + err = inet_diag_bc_audit(cb_data, skb); + if (err) { + kfree(cb_data); + return err; } nla = cb_data->inet_diag_nla_bpf_stgs; -- cgit v1.2.3 From a59076f2669ec23a122549e1f4114e8d4255b632 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 16 Aug 2025 10:28:57 -0700 Subject: lsm: security_lsmblob_to_secctx module selection Add a parameter lsmid to security_lsmblob_to_secctx() to identify which of the security modules that may be active should provide the security context. If the value of lsmid is LSM_ID_UNDEF the first LSM providing a hook is used. security_secid_to_secctx() is unchanged, and will always report the first LSM providing a hook. 
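A sketch of the two call styles the new parameter enables on the caller side (a fragment, not from the patch; LSM_ID_* values come from uapi/linux/lsm.h):

    struct lsm_context ctx;
    struct lsm_prop prop;
    int len;

    security_current_getlsmprop_subj(&prop);

    /* Whichever active LSM supplies the hook first: */
    len = security_lsmprop_to_secctx(&prop, &ctx, LSM_ID_UNDEF);

    /* Or ask a specific module, e.g. SELinux: */
    len = security_lsmprop_to_secctx(&prop, &ctx, LSM_ID_SELINUX);
    if (len >= 0)
            security_release_secctx(&ctx);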
Signed-off-by: Casey Schaufler [PM: subj tweak] Signed-off-by: Paul Moore --- include/linux/security.h | 6 ++++-- kernel/audit.c | 4 ++-- kernel/auditsc.c | 8 +++++--- net/netlabel/netlabel_user.c | 3 ++- security/security.c | 18 ++++++++++++++++-- 5 files changed, 29 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 521bcb5b9717..6d1ed6e7387b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -567,7 +567,8 @@ int security_getprocattr(struct task_struct *p, int lsmid, const char *name, int security_setprocattr(int lsmid, const char *name, void *value, size_t size); int security_ismaclabel(const char *name); int security_secid_to_secctx(u32 secid, struct lsm_context *cp); -int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp); +int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp, + int lsmid); int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid); void security_release_secctx(struct lsm_context *cp); void security_inode_invalidate_secctx(struct inode *inode); @@ -1551,7 +1552,8 @@ static inline int security_secid_to_secctx(u32 secid, struct lsm_context *cp) } static inline int security_lsmprop_to_secctx(struct lsm_prop *prop, - struct lsm_context *cp) + struct lsm_context *cp, + int lsmid) { return -EOPNOTSUPP; } diff --git a/kernel/audit.c b/kernel/audit.c index 547967cb4266..226c8ae00d04 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1473,7 +1473,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, case AUDIT_SIGNAL_INFO: if (lsmprop_is_set(&audit_sig_lsm)) { err = security_lsmprop_to_secctx(&audit_sig_lsm, - &lsmctx); + &lsmctx, LSM_ID_UNDEF); if (err < 0) return err; } @@ -2188,7 +2188,7 @@ int audit_log_task_context(struct audit_buffer *ab) if (!lsmprop_is_set(&prop)) return 0; - error = security_lsmprop_to_secctx(&prop, &ctx); + error = security_lsmprop_to_secctx(&prop, &ctx, LSM_ID_UNDEF); if (error < 0) { if (error != -EINVAL) goto error_path; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8ec768e2c1e5..3b606fd4ae8e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1109,7 +1109,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); if (lsmprop_is_set(prop)) { - if (security_lsmprop_to_secctx(prop, &ctx) < 0) { + if (security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF) < 0) { audit_log_format(ab, " obj=(none)"); rc = 1; } else { @@ -1395,7 +1395,8 @@ static void show_special(struct audit_context *context, int *call_panic) struct lsm_context lsmctx; if (security_lsmprop_to_secctx(&context->ipc.oprop, - &lsmctx) < 0) { + &lsmctx, + LSM_ID_UNDEF) < 0) { *call_panic = 1; } else { audit_log_format(ab, " obj=%s", lsmctx.context); @@ -1560,7 +1561,8 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, if (lsmprop_is_set(&n->oprop)) { struct lsm_context ctx; - if (security_lsmprop_to_secctx(&n->oprop, &ctx) < 0) { + if (security_lsmprop_to_secctx(&n->oprop, &ctx, + LSM_ID_UNDEF) < 0) { if (call_panic) *call_panic = 2; } else { diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 0d04d23aafe7..6d6545297ee3 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -98,7 +98,8 @@ struct audit_buffer *netlbl_audit_start_common(int type, audit_info->sessionid); if (lsmprop_is_set(&audit_info->prop) && - 
security_lsmprop_to_secctx(&audit_info->prop, &ctx) > 0) { + security_lsmprop_to_secctx(&audit_info->prop, &ctx, + LSM_ID_UNDEF) > 0) { audit_log_format(audit_buf, " subj=%s", ctx.context); security_release_secctx(&ctx); } diff --git a/security/security.c b/security/security.c index ad163f06bf7a..dd588f548a2b 100644 --- a/security/security.c +++ b/security/security.c @@ -4342,17 +4342,31 @@ EXPORT_SYMBOL(security_secid_to_secctx); * security_lsmprop_to_secctx() - Convert a lsm_prop to a secctx * @prop: lsm specific information * @cp: the LSM context + * @lsmid: which security module to report * * Convert a @prop entry to security context. If @cp is NULL the * length of the result will be returned. This does mean that the * length could change between calls to check the length and the * next call which actually allocates and returns the @cp. * + * @lsmid identifies which LSM should supply the context. + * A value of LSM_ID_UNDEF indicates that the first LSM suppling + * the hook should be used. This is used in cases where the + * ID of the supplying LSM is unambiguous. + * * Return: Return length of data on success, error on failure. */ -int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp) +int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp, + int lsmid) { - return call_int_hook(lsmprop_to_secctx, prop, cp); + struct lsm_static_call *scall; + + lsm_for_each_hook(scall, lsmprop_to_secctx) { + if (lsmid != LSM_ID_UNDEF && lsmid != scall->hl->lsmid->id) + continue; + return scall->hl->hook.lsmprop_to_secctx(prop, cp); + } + return LSM_RET_DEFAULT(lsmprop_to_secctx); } EXPORT_SYMBOL(security_lsmprop_to_secctx); -- cgit v1.2.3 From eb59d494eebd4c5414728a35cdea6a0ba78ff26e Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 16 Aug 2025 10:28:58 -0700 Subject: audit: add record for multiple task security contexts Replace the single skb pointer in an audit_buffer with a list of skb pointers. Add the audit_stamp information to the audit_buffer as there's no guarantee that there will be an audit_context containing the stamp associated with the event. At audit_log_end() time, create whatever auxiliary records have been added to the list. Functions are created to manage the skb list in the audit_buffer. Create a new audit record AUDIT_MAC_TASK_CONTEXTS. An example of the MAC_TASK_CONTEXTS record is: type=MAC_TASK_CONTEXTS msg=audit(1600880931.832:113) subj_apparmor=unconfined subj_smack=_ When an audit event includes an AUDIT_MAC_TASK_CONTEXTS record, the "subj=" field in other records in the event will be "subj=?". An AUDIT_MAC_TASK_CONTEXTS record is supplied when the system has multiple security modules that may make access decisions based on a subject security context. Refactor audit_log_task_context(), creating a new audit_log_subj_ctx(). This is used in netlabel auditing to provide multiple subject security contexts as necessary.
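In practice a security module opts in once at initialization and existing callers are unchanged; a sketch (my_lsmid is a placeholder for the module's struct lsm_id, cf. the AppArmor hunk at the end of this patch, and ab is an already started audit_buffer):

    /* LSM side: declare that this module supplies a subject context. */
    audit_cfg_lsm(&my_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT);

    /* Audit side: unchanged. With two or more registered modules this
     * logs "subj=?" and queues an auxiliary AUDIT_MAC_TASK_CONTEXTS
     * record on the same audit_buffer. */
    audit_log_task_context(ab);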
Suggested-by: Paul Moore Signed-off-by: Casey Schaufler [PM: subj tweak, audit example readability indents] Signed-off-by: Paul Moore --- include/linux/audit.h | 16 ++++ include/uapi/linux/audit.h | 1 + kernel/audit.c | 208 ++++++++++++++++++++++++++++++++++++------- net/netlabel/netlabel_user.c | 9 +- security/apparmor/lsm.c | 3 + security/selinux/hooks.c | 3 + security/smack/smack_lsm.c | 3 + 7 files changed, 202 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index e3f06eba9c6e..a1f068bcb3a0 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -37,6 +37,8 @@ struct audit_watch; struct audit_tree; struct sk_buff; struct kern_ipc_perm; +struct lsm_id; +struct lsm_prop; struct audit_krule { u32 pflags; @@ -147,6 +149,9 @@ extern unsigned compat_signal_class[]; #define AUDIT_TTY_ENABLE BIT(0) #define AUDIT_TTY_LOG_PASSWD BIT(1) +/* bit values for audit_cfg_lsm */ +#define AUDIT_CFG_LSM_SECCTX_SUBJECT BIT(0) + struct filename; #define AUDIT_OFF 0 @@ -185,6 +190,7 @@ extern void audit_log_path_denied(int type, const char *operation); extern void audit_log_lost(const char *message); +extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); @@ -210,6 +216,8 @@ extern u32 audit_enabled; extern int audit_signal_info(int sig, struct task_struct *t); +extern void audit_cfg_lsm(const struct lsm_id *lsmid, int flags); + #else /* CONFIG_AUDIT */ static inline __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, @@ -245,6 +253,11 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline void audit_log_path_denied(int type, const char *operation) { } +static inline int audit_log_subj_ctx(struct audit_buffer *ab, + struct lsm_prop *prop) +{ + return 0; +} static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; @@ -269,6 +282,9 @@ static inline int audit_signal_info(int sig, struct task_struct *t) return 0; } +static inline void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) +{ } + #endif /* CONFIG_AUDIT */ #ifdef CONFIG_AUDIT_COMPAT_GENERIC diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 9a4ecc9f6dc5..8cad2f307719 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -148,6 +148,7 @@ #define AUDIT_IPE_POLICY_LOAD 1422 /* IPE policy load */ #define AUDIT_LANDLOCK_ACCESS 1423 /* Landlock denial */ #define AUDIT_LANDLOCK_DOMAIN 1424 /* Landlock domain status */ +#define AUDIT_MAC_TASK_CONTEXTS 1425 /* Multiple LSM task contexts */ #define AUDIT_FIRST_KERN_ANOM_MSG 1700 #define AUDIT_LAST_KERN_ANOM_MSG 1799 diff --git a/kernel/audit.c b/kernel/audit.c index 226c8ae00d04..c924b30f2524 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -81,6 +82,11 @@ static u32 audit_failure = AUDIT_FAIL_PRINTK; /* private audit network namespace index */ static unsigned int audit_net_id; +/* Number of modules that provide a security context. + List of lsms that provide a security context */ +static u32 audit_subj_secctx_cnt; +static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT]; + /** * struct audit_net - audit private network namespace data * @sk: communication socket @@ -195,8 +201,10 @@ static struct audit_ctl_mutex { * to place it on a transmit queue. 
Multiple audit_buffers can be in * use simultaneously. */ struct audit_buffer { - struct sk_buff *skb; /* formatted skb ready to send */ + struct sk_buff *skb; /* the skb for audit_log functions */ + struct sk_buff_head skb_list; /* formatted skbs, ready to send */ struct audit_context *ctx; /* NULL or associated context */ + struct audit_stamp stamp; /* audit stamp for these records */ gfp_t gfp_mask; }; @@ -278,6 +286,27 @@ static pid_t auditd_pid_vnr(void) return pid; } +/** + * audit_cfg_lsm - Identify a security module as providing a secctx. + * @lsmid: LSM identity + * @flags: which contexts are provided + * + * Description: + * Increments the count of the security modules providing a secctx. + * If the LSM id is already in the list leave it alone. + */ +void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) +{ + int i; + + if (flags & AUDIT_CFG_LSM_SECCTX_SUBJECT) { + for (i = 0 ; i < audit_subj_secctx_cnt; i++) + if (audit_subj_lsms[i] == lsmid) + return; + audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid; + } +} + /** * audit_get_sk - Return the audit socket for the given network namespace * @net: the destination network namespace @@ -1776,10 +1805,13 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { + struct sk_buff *skb; + if (!ab) return; - kfree_skb(ab->skb); + while ((skb = skb_dequeue(&ab->skb_list))) + kfree_skb(skb); kmem_cache_free(audit_buffer_cache, ab); } @@ -1795,6 +1827,10 @@ static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx, ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) goto err; + + skb_queue_head_init(&ab->skb_list); + skb_queue_tail(&ab->skb_list, ab->skb); + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) goto err; @@ -1860,7 +1896,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; - struct audit_stamp stamp; if (audit_initialized != AUDIT_INITIALIZED) return NULL; @@ -1915,14 +1950,14 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; } - audit_get_stamp(ab->ctx, &stamp); + audit_get_stamp(ab->ctx, &ab->stamp); /* cancel dummy context to enable supporting records */ if (ctx) ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", - (unsigned long long)stamp.ctime.tv_sec, - stamp.ctime.tv_nsec/1000000, - stamp.serial); + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); return ab; } @@ -2178,31 +2213,128 @@ void audit_log_key(struct audit_buffer *ab, char *key) audit_log_format(ab, "(null)"); } -int audit_log_task_context(struct audit_buffer *ab) +/** + * audit_buffer_aux_new - Add an aux record buffer to the skb list + * @ab: audit_buffer + * @type: message type + * + * Aux records are allocated and added to the skb list of + * the "main" record. The ab->skb is reset to point to the + * aux record on its creation. When the aux record in complete + * ab->skb has to be reset to point to the "main" record. + * This allows the audit_log_ functions to be ignorant of + * which kind of record it is logging to. It also avoids adding + * special data for aux records. + * + * On success ab->skb will point to the new aux record. + * Returns 0 on success, -ENOMEM should allocation fail. 
+ */ +static int audit_buffer_aux_new(struct audit_buffer *ab, int type) +{ + WARN_ON(ab->skb != skb_peek(&ab->skb_list)); + + ab->skb = nlmsg_new(AUDIT_BUFSIZ, ab->gfp_mask); + if (!ab->skb) + goto err; + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) + goto err; + skb_queue_tail(&ab->skb_list, ab->skb); + + audit_log_format(ab, "audit(%llu.%03lu:%u): ", + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); + + return 0; + +err: + kfree_skb(ab->skb); + ab->skb = skb_peek(&ab->skb_list); + return -ENOMEM; +} + +/** + * audit_buffer_aux_end - Switch back to the "main" record from an aux record + * @ab: audit_buffer + * + * Restores the "main" audit record to ab->skb. + */ +static void audit_buffer_aux_end(struct audit_buffer *ab) +{ + ab->skb = skb_peek(&ab->skb_list); +} + +/** + * audit_log_subj_ctx - Add LSM subject information + * @ab: audit_buffer + * @prop: LSM subject properties. + * + * Add a subj= field and, if necessary, an AUDIT_MAC_TASK_CONTEXTS record. + */ +int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) { - struct lsm_prop prop; struct lsm_context ctx; + char *space = ""; int error; + int i; - security_current_getlsmprop_subj(&prop); - if (!lsmprop_is_set(&prop)) + security_current_getlsmprop_subj(prop); + if (!lsmprop_is_set(prop)) return 0; - error = security_lsmprop_to_secctx(&prop, &ctx, LSM_ID_UNDEF); - if (error < 0) { - if (error != -EINVAL) - goto error_path; + if (audit_subj_secctx_cnt < 2) { + error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); + if (error < 0) { + if (error != -EINVAL) + goto error_path; + return 0; + } + audit_log_format(ab, " subj=%s", ctx.context); + security_release_secctx(&ctx); return 0; } - - audit_log_format(ab, " subj=%s", ctx.context); - security_release_secctx(&ctx); + /* Multiple LSMs provide contexts. Include an aux record. */ + audit_log_format(ab, " subj=?"); + error = audit_buffer_aux_new(ab, AUDIT_MAC_TASK_CONTEXTS); + if (error) + goto error_path; + + for (i = 0; i < audit_subj_secctx_cnt; i++) { + error = security_lsmprop_to_secctx(prop, &ctx, + audit_subj_lsms[i]->id); + if (error < 0) { + /* + * Don't print anything. An LSM like BPF could + * claim to support contexts, but only do so under + * certain conditions.
+ */ + if (error == -EOPNOTSUPP) + continue; + if (error != -EINVAL) + audit_panic("error in audit_log_subj_ctx"); + } else { + audit_log_format(ab, "%ssubj_%s=%s", space, + audit_subj_lsms[i]->name, ctx.context); + space = " "; + security_release_secctx(&ctx); + } + } + audit_buffer_aux_end(ab); return 0; error_path: - audit_panic("error in audit_log_task_context"); + audit_panic("error in audit_log_subj_ctx"); return error; } +EXPORT_SYMBOL(audit_log_subj_ctx); + +int audit_log_task_context(struct audit_buffer *ab) +{ + struct lsm_prop prop; + + security_current_getlsmprop_subj(&prop); + return audit_log_subj_ctx(ab, &prop); +} EXPORT_SYMBOL(audit_log_task_context); void audit_log_d_path_exe(struct audit_buffer *ab, @@ -2411,6 +2543,26 @@ int audit_signal_info(int sig, struct task_struct *t) return audit_signal_info_syscall(t); } +/** + * __audit_log_end - enqueue one audit record + * @skb: the buffer to send + */ +static void __audit_log_end(struct sk_buff *skb) +{ + struct nlmsghdr *nlh; + + if (audit_rate_check()) { + /* setup the netlink header, see the comments in + * kauditd_send_multicast_skb() for length quirks */ + nlh = nlmsg_hdr(skb); + nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + + /* queue the netlink packet */ + skb_queue_tail(&audit_queue, skb); + } else + audit_log_lost("rate limit exceeded"); +} + /** * audit_log_end - end one audit record * @ab: the audit_buffer @@ -2423,25 +2575,15 @@ int audit_signal_info(int sig, struct task_struct *t) void audit_log_end(struct audit_buffer *ab) { struct sk_buff *skb; - struct nlmsghdr *nlh; if (!ab) return; - if (audit_rate_check()) { - skb = ab->skb; - ab->skb = NULL; + while ((skb = skb_dequeue(&ab->skb_list))) + __audit_log_end(skb); - /* setup the netlink header, see the comments in - * kauditd_send_multicast_skb() for length quirks */ - nlh = nlmsg_hdr(skb); - nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; - - /* queue the netlink packet and poke the kauditd thread */ - skb_queue_tail(&audit_queue, skb); - wake_up_interruptible(&kauditd_wait); - } else - audit_log_lost("rate limit exceeded"); + /* poke the kauditd thread */ + wake_up_interruptible(&kauditd_wait); audit_buffer_free(ab); } diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 6d6545297ee3..0da652844dd6 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -84,7 +84,6 @@ struct audit_buffer *netlbl_audit_start_common(int type, struct netlbl_audit *audit_info) { struct audit_buffer *audit_buf; - struct lsm_context ctx; if (audit_enabled == AUDIT_OFF) return NULL; @@ -96,13 +95,7 @@ struct audit_buffer *netlbl_audit_start_common(int type, audit_log_format(audit_buf, "netlabel: auid=%u ses=%u", from_kuid(&init_user_ns, audit_info->loginuid), audit_info->sessionid); - - if (lsmprop_is_set(&audit_info->prop) && - security_lsmprop_to_secctx(&audit_info->prop, &ctx, - LSM_ID_UNDEF) > 0) { - audit_log_format(audit_buf, " subj=%s", ctx.context); - security_release_secctx(&ctx); - } + audit_log_subj_ctx(audit_buf, &audit_info->prop); return audit_buf; } diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 8e1cc229b41b..220d1684b8d4 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -2530,6 +2530,9 @@ static int __init apparmor_init(void) security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks), &apparmor_lsmid); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&apparmor_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + /* Report that AppArmor successfully initialized */ 
apparmor_initialized = 1; if (aa_g_profile_mode == APPARMOR_COMPLAIN) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d..975b84b466b4 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7618,6 +7618,9 @@ static __init int selinux_init(void) /* Set the security state for the initial task. */ cred_init_security(); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&selinux_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); if (!default_noexec) pr_notice("SELinux: virtual memory is executable by default\n"); diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index fc340a6f0dde..eaff9b8901a7 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -5267,6 +5267,9 @@ static __init int smack_init(void) /* initialize the smack_known_list */ init_smack_known_list(); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&smack_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + return 0; } -- cgit v1.2.3 From 0ffbc876d03c80b83d70aeefac7bbb94a9f4e135 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 16 Aug 2025 10:28:59 -0700 Subject: audit: add record for multiple object contexts Create a new audit record AUDIT_MAC_OBJ_CONTEXTS. An example of the MAC_OBJ_CONTEXTS record is: type=MAC_OBJ_CONTEXTS msg=audit(1601152467.009:1050): obj_selinux=unconfined_u:object_r:user_home_t:s0 When an audit event includes an AUDIT_MAC_OBJ_CONTEXTS record, the "obj=" field in other records in the event will be "obj=?". An AUDIT_MAC_OBJ_CONTEXTS record is supplied when the system has multiple security modules that may make access decisions based on an object security context. Signed-off-by: Casey Schaufler [PM: subj tweak, audit example readability indents] Signed-off-by: Paul Moore --- include/linux/audit.h | 7 ++++++ include/uapi/linux/audit.h | 1 + kernel/audit.c | 58 +++++++++++++++++++++++++++++++++++++++++++++- kernel/auditsc.c | 38 ++++++------------------ security/selinux/hooks.c | 4 +++- security/smack/smack_lsm.c | 4 +++- 6 files changed, 78 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index a1f068bcb3a0..536f8ee8da81 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -151,6 +151,7 @@ extern unsigned compat_signal_class[]; /* bit values for audit_cfg_lsm */ #define AUDIT_CFG_LSM_SECCTX_SUBJECT BIT(0) +#define AUDIT_CFG_LSM_SECCTX_OBJECT BIT(1) struct filename; @@ -191,6 +192,7 @@ extern void audit_log_path_denied(int type, extern void audit_log_lost(const char *message); extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); +extern int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); @@ -258,6 +260,11 @@ static inline int audit_log_subj_ctx(struct audit_buffer *ab, { return 0; } +static inline int audit_log_obj_ctx(struct audit_buffer *ab, + struct lsm_prop *prop) +{ + return 0; +} static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 8cad2f307719..14a1c1fe013a 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -149,6 +149,7 @@ #define AUDIT_LANDLOCK_ACCESS 1423 /* Landlock denial */ #define AUDIT_LANDLOCK_DOMAIN 1424 /* Landlock domain status */ #define AUDIT_MAC_TASK_CONTEXTS
1425 /* Multiple LSM task contexts */ +#define AUDIT_MAC_OBJ_CONTEXTS 1426 /* Multiple LSM object contexts */ #define AUDIT_FIRST_KERN_ANOM_MSG 1700 #define AUDIT_LAST_KERN_ANOM_MSG 1799 diff --git a/kernel/audit.c b/kernel/audit.c index c924b30f2524..bd7474fd8d2c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -85,7 +85,9 @@ static unsigned int audit_net_id; /* Number of modules that provide a security context. List of lsms that provide a security context */ static u32 audit_subj_secctx_cnt; +static u32 audit_obj_secctx_cnt; static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT]; +static const struct lsm_id *audit_obj_lsms[MAX_LSM_COUNT]; /** * struct audit_net - audit private network namespace data @@ -305,6 +307,12 @@ void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) return; audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid; } + if (flags & AUDIT_CFG_LSM_SECCTX_OBJECT) { + for (i = 0 ; i < audit_obj_secctx_cnt; i++) + if (audit_obj_lsms[i] == lsmid) + return; + audit_obj_lsms[audit_obj_secctx_cnt++] = lsmid; + } } /** @@ -1142,7 +1150,6 @@ static int is_audit_feature_set(int i) return af.features & AUDIT_FEATURE_TO_MASK(i); } - static int audit_get_feature(struct sk_buff *skb) { u32 seq; @@ -2337,6 +2344,55 @@ int audit_log_task_context(struct audit_buffer *ab) } EXPORT_SYMBOL(audit_log_task_context); +int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) +{ + int i; + int rc; + int error = 0; + char *space = ""; + struct lsm_context ctx; + + if (audit_obj_secctx_cnt < 2) { + error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); + if (error < 0) { + if (error != -EINVAL) + goto error_path; + return error; + } + audit_log_format(ab, " obj=%s", ctx.context); + security_release_secctx(&ctx); + return 0; + } + audit_log_format(ab, " obj=?"); + error = audit_buffer_aux_new(ab, AUDIT_MAC_OBJ_CONTEXTS); + if (error) + goto error_path; + + for (i = 0; i < audit_obj_secctx_cnt; i++) { + rc = security_lsmprop_to_secctx(prop, &ctx, + audit_obj_lsms[i]->id); + if (rc < 0) { + audit_log_format(ab, "%sobj_%s=?", space, + audit_obj_lsms[i]->name); + if (rc != -EINVAL) + audit_panic("error in audit_log_obj_ctx"); + error = rc; + } else { + audit_log_format(ab, "%sobj_%s=%s", space, + audit_obj_lsms[i]->name, ctx.context); + security_release_secctx(&ctx); + } + space = " "; + } + + audit_buffer_aux_end(ab); + return error; + +error_path: + audit_panic("error in audit_log_obj_ctx"); + return error; +} + void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm) { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3b606fd4ae8e..d1966144bdfe 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1098,7 +1098,6 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, char *comm) { struct audit_buffer *ab; - struct lsm_context ctx; int rc = 0; ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); @@ -1108,15 +1107,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); - if (lsmprop_is_set(prop)) { - if (security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF) < 0) { - audit_log_format(ab, " obj=(none)"); - rc = 1; - } else { - audit_log_format(ab, " obj=%s", ctx.context); - security_release_secctx(&ctx); - } - } + if (lsmprop_is_set(prop) && audit_log_obj_ctx(ab, prop)) + rc = 1; + audit_log_format(ab, " ocomm="); audit_log_untrustedstring(ab, comm); audit_log_end(ab);
@@ -1392,16 +1385,8 @@ static void show_special(struct audit_context *context, int *call_panic) from_kgid(&init_user_ns, context->ipc.gid), context->ipc.mode); if (lsmprop_is_set(&context->ipc.oprop)) { - struct lsm_context lsmctx; - - if (security_lsmprop_to_secctx(&context->ipc.oprop, - &lsmctx, - LSM_ID_UNDEF) < 0) { + if (audit_log_obj_ctx(ab, &context->ipc.oprop)) *call_panic = 1; - } else { - audit_log_format(ab, " obj=%s", lsmctx.context); - security_release_secctx(&lsmctx); - } } if (context->ipc.has_perm) { audit_log_end(ab); @@ -1558,18 +1543,9 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); - if (lsmprop_is_set(&n->oprop)) { - struct lsm_context ctx; - - if (security_lsmprop_to_secctx(&n->oprop, &ctx, - LSM_ID_UNDEF) < 0) { - if (call_panic) - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx.context); - security_release_secctx(&ctx); - } - } + if (lsmprop_is_set(&n->oprop) && + audit_log_obj_ctx(ab, &n->oprop)) + *call_panic = 2; /* log the audit_names record type */ switch (n->type) { diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 975b84b466b4..3999f58a1842 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7619,7 +7619,9 @@ static __init int selinux_init(void) cred_init_security(); /* Inform the audit system that secctx is used */ - audit_cfg_lsm(&selinux_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + audit_cfg_lsm(&selinux_lsmid, + AUDIT_CFG_LSM_SECCTX_SUBJECT | + AUDIT_CFG_LSM_SECCTX_OBJECT); default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); if (!default_noexec) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index eaff9b8901a7..fdf2f193a291 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -5268,7 +5268,9 @@ static __init int smack_init(void) init_smack_known_list(); /* Inform the audit system that secctx is used */ - audit_cfg_lsm(&smack_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + audit_cfg_lsm(&smack_lsmid, + AUDIT_CFG_LSM_SECCTX_SUBJECT | + AUDIT_CFG_LSM_SECCTX_OBJECT); return 0; } -- cgit v1.2.3 From 1d8fdabe19267338f29b58f968499e5b55e6a3b6 Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Fri, 29 Aug 2025 12:25:43 +0100 Subject: iio: frequency: adf4350: Fix ADF4350_REG3_12BIT_CLKDIV_MODE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The clk div bits (2 bits wide) do not start in bit 16 but in bit 15. Fix it accordingly. 
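To make the layout concrete, here is a hedged reading of the REG3 field map implied by the macros in this header (inferred from the driver definitions shown below, not checked against the datasheet): the 12-bit clock divider sits at bits [14:3], so the 2-bit divider mode must start directly above it at bit 15; shifting by 16 skipped bit 15 and pushed the upper mode bit into bit 17.

	/* illustration only -- values mirror the REG3 macros in this header */
	#define REG3_12BIT_CLKDIV(x)	((x) << 3)	/* bits [14:3] */
	#define REG3_CLKDIV_MODE(x)	((x) << 15)	/* bits [16:15]; was (x) << 16 */
	#define REG3_12BIT_CSR_EN	(1 << 18)

	/* e.g. mode 0x2: old (wrong) encoding = 0x20000 (bit 17 set),
	 * fixed encoding = 0x10000 (bit 16 set) */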
Fixes: e31166f0fd48 ("iio: frequency: New driver for Analog Devices ADF4350/ADF4351 Wideband Synthesizers") Signed-off-by: Michael Hennerich Signed-off-by: Nuno Sá Link: https://patch.msgid.link/20250829-adf4350-fix-v2-2-0bf543ba797d@analog.com Cc: Signed-off-by: Jonathan Cameron --- include/linux/iio/frequency/adf4350.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/frequency/adf4350.h b/include/linux/iio/frequency/adf4350.h index de45cf2ee1e4..ce2086f97e3f 100644 --- a/include/linux/iio/frequency/adf4350.h +++ b/include/linux/iio/frequency/adf4350.h @@ -51,7 +51,7 @@ /* REG3 Bit Definitions */ #define ADF4350_REG3_12BIT_CLKDIV(x) ((x) << 3) -#define ADF4350_REG3_12BIT_CLKDIV_MODE(x) ((x) << 16) +#define ADF4350_REG3_12BIT_CLKDIV_MODE(x) ((x) << 15) #define ADF4350_REG3_12BIT_CSR_EN (1 << 18) #define ADF4351_REG3_CHARGE_CANCELLATION_EN (1 << 21) #define ADF4351_REG3_ANTI_BACKLASH_3ns_EN (1 << 22) -- cgit v1.2.3 From 37b27bd5d6217b75d315f28b4399aad0a336f299 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 26 Aug 2025 11:39:02 -0400 Subject: fs: add an icount_read helper Instead of doing direct access to ->i_count, add a helper to handle this. This will make it easier to convert i_count to a refcount later. Signed-off-by: Josef Bacik Link: https://lore.kernel.org/9bc62a84c6b9d6337781203f60837bd98fbc4a96.1756222464.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- arch/powerpc/platforms/cell/spufs/file.c | 2 +- fs/btrfs/inode.c | 2 +- fs/ceph/mds_client.c | 2 +- fs/ext4/ialloc.c | 4 ++-- fs/fs-writeback.c | 2 +- fs/hpfs/inode.c | 2 +- fs/inode.c | 8 ++++---- fs/nfs/inode.c | 4 ++-- fs/notify/fsnotify.c | 2 +- fs/smb/client/inode.c | 2 +- fs/ubifs/super.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_trace.h | 2 +- include/linux/fs.h | 5 +++++ include/trace/events/filelock.h | 2 +- security/landlock/fs.c | 2 +- 16 files changed, 25 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index d5a2c77bc908..ce839783c0df 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -1430,7 +1430,7 @@ static int spufs_mfc_open(struct inode *inode, struct file *file) if (ctx->owner != current->mm) return -EINVAL; - if (atomic_read(&inode->i_count) != 1) + if (icount_read(inode) != 1) return -EBUSY; mutex_lock(&ctx->mapping_lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index de722b232ec1..5bcd8e25fa78 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4538,7 +4538,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root) inode = btrfs_find_first_inode(root, min_ino); while (inode) { - if (atomic_read(&inode->vfs_inode.i_count) > 1) + if (icount_read(&inode->vfs_inode) > 1) d_prune_aliases(&inode->vfs_inode); min_ino = btrfs_ino(inode) + 1; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0f497c39ff82..62dba710504d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2221,7 +2221,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg) int count; dput(dentry); d_prune_aliases(inode); - count = atomic_read(&inode->i_count); + count = icount_read(inode); if (count == 1) (*remaining)--; doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n", diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index df4051613b29..ba4fd9aba1c1 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -252,10 +252,10 @@ void 
ext4_free_inode(handle_t *handle, struct inode *inode) "nonexistent device\n", __func__, __LINE__); return; } - if (atomic_read(&inode->i_count) > 1) { + if (icount_read(inode) > 1) { ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", __func__, __LINE__, inode->i_ino, - atomic_read(&inode->i_count)); + icount_read(inode)); return; } if (inode->i_nlink) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cc57367fb641..6088a67b2aae 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1767,7 +1767,7 @@ static int writeback_single_inode(struct inode *inode, int ret = 0; spin_lock(&inode->i_lock); - if (!atomic_read(&inode->i_count)) + if (!icount_read(inode)) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); else WARN_ON(inode->i_state & I_WILL_FREE); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index a59e8fa630db..34008442ee26 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -184,7 +184,7 @@ void hpfs_write_inode(struct inode *i) struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct inode *parent; if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; - if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) { + if (hpfs_inode->i_rddir_off && !icount_read(i)) { if (*hpfs_inode->i_rddir_off) pr_err("write_inode: some position still there\n"); kfree(hpfs_inode->i_rddir_off); diff --git a/fs/inode.c b/fs/inode.c index 01a554e11279..fe4868e2a954 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -534,7 +534,7 @@ static void __inode_add_lru(struct inode *inode, bool rotate) { if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) return; - if (atomic_read(&inode->i_count)) + if (icount_read(inode)) return; if (!(inode->i_sb->s_flags & SB_ACTIVE)) return; @@ -871,11 +871,11 @@ void evict_inodes(struct super_block *sb) again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (atomic_read(&inode->i_count)) + if (icount_read(inode)) continue; spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_count)) { + if (icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } @@ -937,7 +937,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, * unreclaimable for a while. Remove them lazily here; iput, * sync, or the last page cache deletion will requeue them. */ - if (atomic_read(&inode->i_count) || + if (icount_read(inode) || (inode->i_state & ~I_REFERENCED) || !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 338ef77ae423..b52805951856 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -608,7 +608,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), - atomic_read(&inode->i_count)); + icount_read(inode)); out: return inode; @@ -2229,7 +2229,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), - atomic_read(&inode->i_count), fattr->valid); + icount_read(inode), fattr->valid); if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { /* Only a mounted-on-fileid? Just exit */ diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 079b868552c2..46bfc543f946 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -66,7 +66,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) * removed all zero refcount inodes, in any case. 
Test to * be sure. */ - if (!atomic_read(&inode->i_count)) { + if (!icount_read(inode)) { spin_unlock(&inode->i_lock); continue; } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 75be4b46bc6f..211d5b8b42f4 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2749,7 +2749,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) } cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n", - full_path, inode, inode->i_count.counter, + full_path, inode, icount_read(inode), dentry, cifs_get_time(dentry), jiffies); again: diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f3e3b2068608..a0269ba96e3d 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -358,7 +358,7 @@ static void ubifs_evict_inode(struct inode *inode) goto out; dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); - ubifs_assert(c, !atomic_read(&inode->i_count)); + ubifs_assert(c, !icount_read(inode)); truncate_inode_pages_final(&inode->i_data); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9c39251961a3..df8eab11dc48 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1035,7 +1035,7 @@ xfs_itruncate_extents_flags( int error = 0; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - if (atomic_read(&VFS_I(ip)->i_count)) + if (icount_read(VFS_I(ip))) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e1794e3e3156..34001503fc8b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1151,7 +1151,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class, TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->count = icount_read(VFS_I(ip)); __entry->pincount = atomic_read(&ip->i_pincount); __entry->iflags = ip->i_flags; __entry->caller_ip = caller_ip; diff --git a/include/linux/fs.h b/include/linux/fs.h index c34554d8c4fe..c4fd010cf5bf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2611,6 +2611,11 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } +static inline int icount_read(const struct inode *inode) +{ + return atomic_read(&inode->i_count); +} + /* * Returns true if the given inode itself only has dirty timestamps (its pages * may still be dirty) and isn't currently being allocated or freed. diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h index b8d1e00a7982..fdd36b1daa25 100644 --- a/include/trace/events/filelock.h +++ b/include/trace/events/filelock.h @@ -189,7 +189,7 @@ TRACE_EVENT(generic_add_lease, __entry->i_ino = inode->i_ino; __entry->wcount = atomic_read(&inode->i_writecount); __entry->rcount = atomic_read(&inode->i_readcount); - __entry->icount = atomic_read(&inode->i_count); + __entry->icount = icount_read(inode); __entry->owner = fl->c.flc_owner; __entry->flags = fl->c.flc_flags; __entry->type = fl->c.flc_type; diff --git a/security/landlock/fs.c b/security/landlock/fs.c index c04f8879ad03..0bade2c5aa1d 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -1281,7 +1281,7 @@ static void hook_sb_delete(struct super_block *const sb) struct landlock_object *object; /* Only handles referenced inodes. 
*/ - if (!atomic_read(&inode->i_count)) + if (!icount_read(inode)) continue; /* -- cgit v1.2.3 From e5bca063c150877c45b88ff134b6ef7a5eae8e7a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 30 Aug 2025 12:55:39 +0200 Subject: fs: remove vfs_ioctl export vfs_ioctl() is no longer called by anything outside of fs/ioctl.c, so remove the global symbol and export as it is not needed. Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/2025083038-carving-amuck-a4ae@gregkh Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/ioctl.c | 3 +-- include/linux/fs.h | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/ioctl.c b/fs/ioctl.c index 83d07218b6cd..1c152c2b1b67 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -41,7 +41,7 @@ * * Returns 0 on success, -errno on error. */ -int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +static int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; @@ -54,7 +54,6 @@ int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) out: return error; } -EXPORT_SYMBOL(vfs_ioctl); static int ioctl_fibmap(struct file *filp, int __user *p) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 34693cae15a2..4daf9b30a641 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2053,8 +2053,6 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); -int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); - #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -- cgit v1.2.3 From edd3cb05c00a040dc72bed20b14b5ba865188bce Mon Sep 17 00:00:00 2001 From: Simon Schuster Date: Mon, 1 Sep 2025 15:09:51 +0200 Subject: copy_process: pass clone_flags as u64 across calltree With the introduction of clone3 in commit 7f192e3cd316 ("fork: add clone3") the effective bit width of clone_flags on all architectures was increased from 32-bit to 64-bit, with a new type of u64 for the flags. However, for most consumers of clone_flags the interface was not changed from the previous type of unsigned long. While this works fine as long as none of the new 64-bit flag bits (CLONE_CLEAR_SIGHAND and CLONE_INTO_CGROUP) are evaluated, this is still undesirable in terms of the principle of least surprise. Thus, this commit fixes all relevant interfaces of callees to sys_clone3/copy_process (excluding the architecture-specific copy_thread) to consistently pass clone_flags as u64, so that no truncation to 32-bit integers occurs on 32-bit architectures. 
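To see what the wider type buys on a 32-bit architecture, consider this minimal userspace sketch (illustrative only; the flag values mirror include/uapi/linux/sched.h, while the callee names are hypothetical):

	#include <stdint.h>
	#include <stdio.h>

	#define CLONE_CLEAR_SIGHAND 0x100000000ULL	/* bit 32 */
	#define CLONE_INTO_CGROUP   0x200000000ULL	/* bit 33 */

	/* old-style callee: unsigned long is 32 bits on ILP32 targets */
	static void old_callee(unsigned long flags)
	{
		/* prints 0 on ILP32: the high bits were truncated at the call */
		printf("into_cgroup=%d\n", !!(flags & CLONE_INTO_CGROUP));
	}

	/* new-style callee: the full 64-bit flag word survives */
	static void new_callee(uint64_t flags)
	{
		printf("into_cgroup=%d\n", !!(flags & CLONE_INTO_CGROUP));
	}

	int main(void)
	{
		uint64_t clone_flags = CLONE_INTO_CGROUP;

		old_callee(clone_flags);	/* 0 on 32-bit, 1 on 64-bit */
		new_callee(clone_flags);	/* 1 everywhere */
		return 0;
	}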
Signed-off-by: Simon Schuster Link: https://lore.kernel.org/20250901-nios2-implement-clone3-v2-2-53fcf5577d57@siemens-energy.com Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Arnd Bergmann Signed-off-by: Christian Brauner --- block/blk-ioc.c | 2 +- fs/namespace.c | 2 +- include/linux/cgroup.h | 4 ++-- include/linux/cred.h | 2 +- include/linux/iocontext.h | 6 +++--- include/linux/ipc_namespace.h | 4 ++-- include/linux/lsm_hook_defs.h | 2 +- include/linux/mnt_namespace.h | 2 +- include/linux/nsproxy.h | 2 +- include/linux/pid_namespace.h | 4 ++-- include/linux/rseq.h | 4 ++-- include/linux/sched/task.h | 2 +- include/linux/security.h | 4 ++-- include/linux/sem.h | 4 ++-- include/linux/time_namespace.h | 4 ++-- include/linux/uprobes.h | 4 ++-- include/linux/user_events.h | 4 ++-- include/linux/utsname.h | 4 ++-- include/net/net_namespace.h | 4 ++-- include/trace/events/task.h | 6 +++--- ipc/namespace.c | 2 +- ipc/sem.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/cred.c | 2 +- kernel/events/uprobes.c | 2 +- kernel/fork.c | 8 ++++---- kernel/nsproxy.c | 4 ++-- kernel/pid_namespace.c | 2 +- kernel/sched/core.c | 4 ++-- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 4 ++-- kernel/time/namespace.c | 2 +- kernel/utsname.c | 2 +- net/core/net_namespace.c | 2 +- security/apparmor/lsm.c | 2 +- security/security.c | 2 +- security/selinux/hooks.c | 2 +- security/tomoyo/tomoyo.c | 2 +- 38 files changed, 59 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 9fda3906e5f5..d15918d7fabb 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -286,7 +286,7 @@ out: } EXPORT_SYMBOL_GPL(set_task_ioprio); -int __copy_io(unsigned long clone_flags, struct task_struct *tsk) +int __copy_io(u64 clone_flags, struct task_struct *tsk) { struct io_context *ioc = current->io_context; diff --git a/fs/namespace.c b/fs/namespace.c index ddfd4457d338..d9c190ffa7df 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4200,7 +4200,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } __latent_entropy -struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, +struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e..56d9556a181a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -796,7 +796,7 @@ extern struct cgroup_namespace init_cgroup_ns; void free_cgroup_ns(struct cgroup_namespace *ns); -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, +struct cgroup_namespace *copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns); @@ -818,7 +818,7 @@ static inline void put_cgroup_ns(struct cgroup_namespace *ns) static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } static inline struct cgroup_namespace * -copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, +copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { return old_ns; diff --git a/include/linux/cred.h b/include/linux/cred.h index a102a10f833f..89ae50ad2ace 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -148,7 +148,7 @@ struct cred { extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); -extern int copy_creds(struct task_struct *, unsigned long); +extern int copy_creds(struct 
task_struct *, u64); extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 14f7eaf1b443..079d8773790c 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -118,8 +118,8 @@ struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); -int __copy_io(unsigned long clone_flags, struct task_struct *tsk); -static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +int __copy_io(u64 clone_flags, struct task_struct *tsk); +static inline int copy_io(u64 clone_flags, struct task_struct *tsk) { if (!current->io_context) return 0; @@ -129,7 +129,7 @@ static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) struct io_context; static inline void put_io_context(struct io_context *ioc) { } static inline void exit_io_context(struct task_struct *task) { } -static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +static inline int copy_io(u64 clone_flags, struct task_struct *tsk) { return 0; } diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e8240cf2611a..4b399893e2b3 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -129,7 +129,7 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif #if defined(CONFIG_IPC_NS) -extern struct ipc_namespace *copy_ipcs(unsigned long flags, +extern struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns); static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) @@ -151,7 +151,7 @@ static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns extern void put_ipc_ns(struct ipc_namespace *ns); #else -static inline struct ipc_namespace *copy_ipcs(unsigned long flags, +static inline struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (flags & CLONE_NEWIPC) diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index fd11fffdd3c3..adbe234a6f6c 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -211,7 +211,7 @@ LSM_HOOK(int, 0, file_open, struct file *file) LSM_HOOK(int, 0, file_post_open, struct file *file, int mask) LSM_HOOK(int, 0, file_truncate, struct file *file) LSM_HOOK(int, 0, task_alloc, struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) LSM_HOOK(void, LSM_RET_VOID, task_free, struct task_struct *task) LSM_HOOK(int, 0, cred_alloc_blank, struct cred *cred, gfp_t gfp) LSM_HOOK(void, LSM_RET_VOID, cred_free, struct cred *cred) diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 70b366b64816..ff290c87b2e7 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -11,7 +11,7 @@ struct fs_struct; struct user_namespace; struct ns_common; -extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, +extern struct mnt_namespace *copy_mnt_ns(u64, struct mnt_namespace *, struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); DEFINE_FREE(put_mnt_ns, struct mnt_namespace *, if (!IS_ERR_OR_NULL(_T)) put_mnt_ns(_T)) diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index dab6a1734a22..82533e899ff4 100644 --- a/include/linux/nsproxy.h +++ 
b/include/linux/nsproxy.h @@ -103,7 +103,7 @@ static inline struct cred *nsset_cred(struct nsset *set) * */ -int copy_namespaces(unsigned long flags, struct task_struct *tsk); +int copy_namespaces(u64 flags, struct task_struct *tsk); void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c67a5811199..0620a3e08e83 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -78,7 +78,7 @@ static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) } #endif -extern struct pid_namespace *copy_pid_ns(unsigned long flags, +extern struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); @@ -97,7 +97,7 @@ static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) return 0; } -static inline struct pid_namespace *copy_pid_ns(unsigned long flags, +static inline struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *ns) { if (flags & CLONE_NEWPID) diff --git a/include/linux/rseq.h b/include/linux/rseq.h index bc8af3eb5598..a96fd345aa38 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -65,7 +65,7 @@ static inline void rseq_migrate(struct task_struct *t) * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. */ -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { if (clone_flags & CLONE_VM) { t->rseq = NULL; @@ -107,7 +107,7 @@ static inline void rseq_preempt(struct task_struct *t) static inline void rseq_migrate(struct task_struct *t) { } -static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ea41795a352b..34d6a0e108c3 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -63,7 +63,7 @@ extern int lockdep_tasklist_lock_is_held(void); extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); -extern int sched_fork(unsigned long clone_flags, struct task_struct *p); +extern int sched_fork(u64 clone_flags, struct task_struct *p); extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); diff --git a/include/linux/security.h b/include/linux/security.h index 521bcb5b9717..9a1d4a6c8673 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -489,7 +489,7 @@ int security_file_receive(struct file *file); int security_file_open(struct file *file); int security_file_post_open(struct file *file, int mask); int security_file_truncate(struct file *file); -int security_task_alloc(struct task_struct *task, unsigned long clone_flags); +int security_task_alloc(struct task_struct *task, u64 clone_flags); void security_task_free(struct task_struct *task); int security_cred_alloc_blank(struct cred *cred, gfp_t gfp); void 
security_cred_free(struct cred *cred); @@ -1215,7 +1215,7 @@ static inline int security_file_truncate(struct file *file) } static inline int security_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { return 0; } diff --git a/include/linux/sem.h b/include/linux/sem.h index c4deefe42aeb..275269ce2ec8 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -9,12 +9,12 @@ struct task_struct; #ifdef CONFIG_SYSVIPC -extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); +extern int copy_semundo(u64 clone_flags, struct task_struct *tsk); extern void exit_sem(struct task_struct *tsk); #else -static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) +static inline int copy_semundo(u64 clone_flags, struct task_struct *tsk) { return 0; } diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index bb2c52f4fc94..b6e36525e0be 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -43,7 +43,7 @@ static inline struct time_namespace *get_time_ns(struct time_namespace *ns) return ns; } -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns); void free_time_ns(struct time_namespace *ns); @@ -129,7 +129,7 @@ static inline void put_time_ns(struct time_namespace *ns) } static inline -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 516217c39094..915303a82d84 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -205,7 +205,7 @@ extern void uprobe_start_dup_mmap(void); extern void uprobe_end_dup_mmap(void); extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm); extern void uprobe_free_utask(struct task_struct *t); -extern void uprobe_copy_process(struct task_struct *t, unsigned long flags); +extern void uprobe_copy_process(struct task_struct *t, u64 flags); extern int uprobe_post_sstep_notifier(struct pt_regs *regs); extern int uprobe_pre_sstep_notifier(struct pt_regs *regs); extern void uprobe_notify_resume(struct pt_regs *regs); @@ -281,7 +281,7 @@ static inline bool uprobe_deny_signal(void) static inline void uprobe_free_utask(struct task_struct *t) { } -static inline void uprobe_copy_process(struct task_struct *t, unsigned long flags) +static inline void uprobe_copy_process(struct task_struct *t, u64 flags) { } static inline void uprobe_clear_state(struct mm_struct *mm) diff --git a/include/linux/user_events.h b/include/linux/user_events.h index 8afa8c3a0973..57d1ff006090 100644 --- a/include/linux/user_events.h +++ b/include/linux/user_events.h @@ -33,7 +33,7 @@ extern void user_event_mm_dup(struct task_struct *t, extern void user_event_mm_remove(struct task_struct *t); static inline void user_events_fork(struct task_struct *t, - unsigned long clone_flags) + u64 clone_flags) { struct user_event_mm *old_mm; @@ -68,7 +68,7 @@ static inline void user_events_exit(struct task_struct *t) } #else static inline void user_events_fork(struct task_struct *t, - unsigned long clone_flags) + u64 clone_flags) { } diff --git a/include/linux/utsname.h b/include/linux/utsname.h index bf7613ba412b..ba34ec0e2f95 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -35,7 +35,7 @@ static inline void get_uts_ns(struct uts_namespace *ns) 
refcount_inc(&ns->ns.count); } -extern struct uts_namespace *copy_utsname(unsigned long flags, +extern struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns); extern void free_uts_ns(struct uts_namespace *ns); @@ -55,7 +55,7 @@ static inline void put_uts_ns(struct uts_namespace *ns) { } -static inline struct uts_namespace *copy_utsname(unsigned long flags, +static inline struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns) { if (flags & CLONE_NEWUTS) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 025a7574b275..0e008cfe159d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -204,7 +204,7 @@ struct net { extern struct net init_net; #ifdef CONFIG_NET_NS -struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, +struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net); void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); @@ -218,7 +218,7 @@ extern struct task_struct *cleanup_net_task; #else /* CONFIG_NET_NS */ #include #include -static inline struct net *copy_net_ns(unsigned long flags, +static inline struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { if (flags & CLONE_NEWNET) diff --git a/include/trace/events/task.h b/include/trace/events/task.h index af535b053033..4f0759634306 100644 --- a/include/trace/events/task.h +++ b/include/trace/events/task.h @@ -8,14 +8,14 @@ TRACE_EVENT(task_newtask, - TP_PROTO(struct task_struct *task, unsigned long clone_flags), + TP_PROTO(struct task_struct *task, u64 clone_flags), TP_ARGS(task, clone_flags), TP_STRUCT__entry( __field( pid_t, pid) __array( char, comm, TASK_COMM_LEN) - __field( unsigned long, clone_flags) + __field( u64, clone_flags) __field( short, oom_score_adj) ), @@ -26,7 +26,7 @@ TRACE_EVENT(task_newtask, __entry->oom_score_adj = task->signal->oom_score_adj; ), - TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%hd", + TP_printk("pid=%d comm=%s clone_flags=%llx oom_score_adj=%hd", __entry->pid, __entry->comm, __entry->clone_flags, __entry->oom_score_adj) ); diff --git a/ipc/namespace.c b/ipc/namespace.c index 4df91ceeeafe..a712ec27209c 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -106,7 +106,7 @@ fail: return ERR_PTR(err); } -struct ipc_namespace *copy_ipcs(unsigned long flags, +struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (!(flags & CLONE_NEWIPC)) diff --git a/ipc/sem.c b/ipc/sem.c index a39cdc7bf88f..0f06e4bd4673 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -2303,7 +2303,7 @@ SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops, * parent and child tasks. 
*/ -int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) +int copy_semundo(u64 clone_flags, struct task_struct *tsk) { struct sem_undo_list *undo_list; int error; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 144a464e45c6..dedadb525880 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -47,7 +47,7 @@ void free_cgroup_ns(struct cgroup_namespace *ns) } EXPORT_SYMBOL(free_cgroup_ns); -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, +struct cgroup_namespace *copy_cgroup_ns(u64 flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { diff --git a/kernel/cred.c b/kernel/cred.c index 9676965c0981..dbf6b687dc5c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -287,7 +287,7 @@ struct cred *prepare_exec_creds(void) * The new process gets the current process's subjective credentials as its * objective and subjective credentials */ -int copy_creds(struct task_struct *p, unsigned long clone_flags) +int copy_creds(struct task_struct *p, u64 clone_flags) { struct cred *new; int ret; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ca1940607bd..b2753014c6dd 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2160,7 +2160,7 @@ static void dup_xol_work(struct callback_head *work) /* * Called in context of a new clone/fork from copy_process. */ -void uprobe_copy_process(struct task_struct *t, unsigned long flags) +void uprobe_copy_process(struct task_struct *t, u64 flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; diff --git a/kernel/fork.c b/kernel/fork.c index 4e2c5a3e8989..d6e1fb11eff9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1507,7 +1507,7 @@ fail_nomem: return NULL; } -static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) +static int copy_mm(u64 clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; @@ -1545,7 +1545,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) +static int copy_fs(u64 clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { @@ -1566,7 +1566,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct *tsk, +static int copy_files(u64 clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; @@ -1645,7 +1645,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) posix_cputimers_group_init(pct, cpu_limit); } -static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) +static int copy_signal(u64 clone_flags, struct task_struct *tsk) { struct signal_struct *sig; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 5f31fdff8a38..8af3b9ec3aa8 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -64,7 +64,7 @@ static inline struct nsproxy *create_nsproxy(void) * Return the newly created nsproxy. Do not attach this to the task, * leave it to the caller to do proper locking and attach it to task. */ -static struct nsproxy *create_new_namespaces(unsigned long flags, +static struct nsproxy *create_new_namespaces(u64 flags, struct task_struct *tsk, struct user_namespace *user_ns, struct fs_struct *new_fs) { @@ -144,7 +144,7 @@ out_ns: * called from clone. This now handles copy for nsproxy and all * namespaces therein. 
*/ -int copy_namespaces(unsigned long flags, struct task_struct *tsk) +int copy_namespaces(u64 flags, struct task_struct *tsk) { struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7098ed44e717..06bc7c7f78e0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -171,7 +171,7 @@ static void destroy_pid_namespace_work(struct work_struct *work) } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)); } -struct pid_namespace *copy_pid_ns(unsigned long flags, +struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4..6fa85d30d965 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4472,7 +4472,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) * __sched_fork() is basic setup which is also used by sched_init() to * initialize the boot CPU's idle task. */ -static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +static void __sched_fork(u64 clone_flags, struct task_struct *p) { p->on_rq = 0; @@ -4707,7 +4707,7 @@ late_initcall(sched_core_sysctl_init); /* * fork()/clone()-time setup: */ -int sched_fork(unsigned long clone_flags, struct task_struct *p) +int sched_fork(u64 clone_flags, struct task_struct *p) { __sched_fork(clone_flags, p); /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c..af0866ce2dfc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3542,7 +3542,7 @@ out: } } -void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +void init_numa_balancing(u64 clone_flags, struct task_struct *p) { int mm_users = 0; struct mm_struct *mm = p->mm; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index be9745d104f7..f9adfc912ddc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1935,12 +1935,12 @@ extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *p, struct task_struct *t, int cpu, int scpu); -extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); +extern void init_numa_balancing(u64 clone_flags, struct task_struct *p); #else /* !CONFIG_NUMA_BALANCING: */ static inline void -init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +init_numa_balancing(u64 clone_flags, struct task_struct *p) { } diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 667452768ed3..888872bcc5bb 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -130,7 +130,7 @@ fail: * * Return: timens_for_children namespace or ERR_PTR. */ -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (!(flags & CLONE_NEWTIME)) diff --git a/kernel/utsname.c b/kernel/utsname.c index b1ac3ca870f2..00d8d7922f86 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -86,7 +86,7 @@ fail: * utsname of this process won't be seen by parent, and vice * versa. 
*/ -struct uts_namespace *copy_utsname(unsigned long flags, +struct uts_namespace *copy_utsname(u64 flags, struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *new_ns; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1b6f3826dd0e..8ec9d83475bf 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -539,7 +539,7 @@ void net_drop_ns(void *p) net_passive_dec(net); } -struct net *copy_net_ns(unsigned long flags, +struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { struct ucounts *ucounts; diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 8e1cc229b41b..ba39cfe0cd08 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -112,7 +112,7 @@ static void apparmor_task_free(struct task_struct *task) } static int apparmor_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { struct aa_task_ctx *new = task_ctx(task); diff --git a/security/security.c b/security/security.c index ad163f06bf7a..a769140553bc 100644 --- a/security/security.c +++ b/security/security.c @@ -3185,7 +3185,7 @@ int security_file_truncate(struct file *file) * * Return: Returns a zero on success, negative values on failure. */ -int security_task_alloc(struct task_struct *task, unsigned long clone_flags) +int security_task_alloc(struct task_struct *task, u64 clone_flags) { int rc = lsm_task_alloc(task); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d..bb016dd511c1 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4144,7 +4144,7 @@ static int selinux_file_open(struct file *file) /* task security operations */ static int selinux_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { u32 sid = current_sid(); diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index d6ebcd9db80a..48fc59d38ab2 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -514,7 +514,7 @@ struct lsm_blob_sizes tomoyo_blob_sizes __ro_after_init = { * Returns 0. */ static int tomoyo_task_alloc(struct task_struct *task, - unsigned long clone_flags) + u64 clone_flags) { struct tomoyo_task *old = tomoyo_task(current); struct tomoyo_task *new = tomoyo_task(task); -- cgit v1.2.3 From d9a3e9929452780df16f3414f0d59b5f69d058cf Mon Sep 17 00:00:00 2001 From: Thomas Andreatta Date: Wed, 27 Aug 2025 17:24:43 +0200 Subject: dmaengine: sh: setup_xfer error handling This patch modifies the type of setup_xfer from void to int and handles errors since the function can fail. `setup_xfer` now returns the (eventual) error from `dmae_set_dmars` or `dmae_set_chcr`, while `shdma_tx_submit` handles the result, removing the chunks from the queue and marking PM as idle in case of an error.
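Since struct shdma_ops now declares .setup_xfer as returning int, every backend has to propagate failures; a minimal sketch of the expected callback shape follows (the driver name and the foo_program_channel() helper are hypothetical):

	/* hypothetical backend adapted to the int-returning hook */
	static int foo_dmae_setup_xfer(struct shdma_chan *schan, int slave_id)
	{
		int ret;

		ret = foo_program_channel(schan, slave_id);	/* may fail */
		if (ret < 0)
			return ret;	/* shdma_tx_submit() unwinds the queue */

		return 0;
	}

	static const struct shdma_ops foo_shdma_ops = {
		.setup_xfer	= foo_dmae_setup_xfer,
		/* ... remaining callbacks ... */
	};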
Signed-off-by: Thomas Andreatta Link: https://lore.kernel.org/r/20250827152442.90962-1-thomas.andreatta2000@gmail.com Signed-off-by: Vinod Koul --- drivers/dma/sh/shdma-base.c | 25 +++++++++++++++++++------ drivers/dma/sh/shdmac.c | 17 +++++++++++++---- include/linux/shdma-base.h | 2 +- 3 files changed, 33 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/sh/shdma-base.c b/drivers/dma/sh/shdma-base.c index 6b4fce453c85..834741adadaa 100644 --- a/drivers/dma/sh/shdma-base.c +++ b/drivers/dma/sh/shdma-base.c @@ -129,12 +129,25 @@ static dma_cookie_t shdma_tx_submit(struct dma_async_tx_descriptor *tx) const struct shdma_ops *ops = sdev->ops; dev_dbg(schan->dev, "Bring up channel %d\n", schan->id); - /* - * TODO: .xfer_setup() might fail on some platforms. - * Make it int then, on error remove chunks from the - * queue again - */ - ops->setup_xfer(schan, schan->slave_id); + + ret = ops->setup_xfer(schan, schan->slave_id); + if (ret < 0) { + dev_err(schan->dev, "setup_xfer failed: %d\n", ret); + + /* Remove chunks from the queue and mark them as idle */ + list_for_each_entry_safe(chunk, c, &schan->ld_queue, node) { + if (chunk->cookie == cookie) { + chunk->mark = DESC_IDLE; + list_move(&chunk->node, &schan->ld_free); + } + } + + schan->pm_state = SHDMA_PM_ESTABLISHED; + ret = pm_runtime_put(schan->dev); + + spin_unlock_irq(&schan->chan_lock); + return ret; + } if (schan->pm_state == SHDMA_PM_PENDING) shdma_chan_xfer_ld_queue(schan); diff --git a/drivers/dma/sh/shdmac.c b/drivers/dma/sh/shdmac.c index 093e449e19ee..603e15102e45 100644 --- a/drivers/dma/sh/shdmac.c +++ b/drivers/dma/sh/shdmac.c @@ -300,21 +300,30 @@ static bool sh_dmae_channel_busy(struct shdma_chan *schan) return dmae_is_busy(sh_chan); } -static void sh_dmae_setup_xfer(struct shdma_chan *schan, - int slave_id) +static int sh_dmae_setup_xfer(struct shdma_chan *schan, int slave_id) { struct sh_dmae_chan *sh_chan = container_of(schan, struct sh_dmae_chan, shdma_chan); + int ret = 0; if (slave_id >= 0) { const struct sh_dmae_slave_config *cfg = sh_chan->config; - dmae_set_dmars(sh_chan, cfg->mid_rid); - dmae_set_chcr(sh_chan, cfg->chcr); + ret = dmae_set_dmars(sh_chan, cfg->mid_rid); + if (ret < 0) + goto END; + + ret = dmae_set_chcr(sh_chan, cfg->chcr); + if (ret < 0) + goto END; + } else { dmae_init(sh_chan); } + +END: + return ret; } /* diff --git a/include/linux/shdma-base.h b/include/linux/shdma-base.h index 6dfd05ef5c2d..03ba4dab2ef7 100644 --- a/include/linux/shdma-base.h +++ b/include/linux/shdma-base.h @@ -96,7 +96,7 @@ struct shdma_ops { int (*desc_setup)(struct shdma_chan *, struct shdma_desc *, dma_addr_t, dma_addr_t, size_t *); int (*set_slave)(struct shdma_chan *, int, dma_addr_t, bool); - void (*setup_xfer)(struct shdma_chan *, int); + int (*setup_xfer)(struct shdma_chan *, int); void (*start_xfer)(struct shdma_chan *, struct shdma_desc *); struct shdma_desc *(*embedded_desc)(void *, int); bool (*chan_irq)(struct shdma_chan *, int); -- cgit v1.2.3 From 7ea95d55e63176899eb96f7aaa34a5646f501b2c Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 26 Aug 2025 11:07:38 -0500 Subject: dmaengine: Fix dma_async_tx_descriptor->tx_submit documentation Commit 790fb9956eea ("linux/dmaengine.h: fix a few kernel-doc warnings") inserted new documentation for @desc_free in the middle of @tx_submit's description. Put @tx_submit's description back together, matching the indentation style of the rest of the documentation for dma_async_tx_descriptor. 
Fixes: 790fb9956eea ("linux/dmaengine.h: fix a few kernel-doc warnings") Reviewed-by: Dave Jiang Signed-off-by: Nathan Lynch Link: https://lore.kernel.org/r/20250826-dma_async_tx_desc-tx_submit-doc-fix-v1-1-18a4b51697db@amd.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 6de7c05d6bd8..99efe2b9b4ea 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -594,9 +594,9 @@ struct dma_descriptor_metadata_ops { * @phys: physical address of the descriptor * @chan: target channel for this operation * @tx_submit: accept the descriptor, assign ordered cookie and mark the + * descriptor pending. To be pushed on .issue_pending() call * @desc_free: driver's callback function to free a resusable descriptor * after completion - * descriptor pending. To be pushed on .issue_pending() call * @callback: routine to call after this operation is complete * @callback_result: error result from a DMA transaction * @callback_param: general parameter to pass to the callback routine -- cgit v1.2.3 From c9f62564252c21d739a5003e9b2d6ad0828aa7bd Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 30 Aug 2025 19:01:11 +0200 Subject: mtd: rawnand: s3c2410: Drop driver (no actual S3C64xx user) The s3c2410 NAND driver still supports the S3C64xx platform, which in general is supported in the kernel. There are, however, no references to the "s3c6400-nand" platform device ID or the "s3c24xx-nand" driver, so this driver cannot be instantiated on the S3C64xx platform and is effectively unused. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Miquel Raynal --- drivers/mtd/nand/raw/Kconfig | 26 - drivers/mtd/nand/raw/Makefile | 1 - drivers/mtd/nand/raw/s3c2410.c | 1192 ------------------------ include/linux/platform_data/mtd-nand-s3c2410.h | 70 -- 4 files changed, 1289 deletions(-) delete mode 100644 drivers/mtd/nand/raw/s3c2410.c delete mode 100644 include/linux/platform_data/mtd-nand-s3c2410.h (limited to 'include/linux') diff --git a/drivers/mtd/nand/raw/Kconfig b/drivers/mtd/nand/raw/Kconfig index 4b99d9c422c3..f053806333c3 100644 --- a/drivers/mtd/nand/raw/Kconfig +++ b/drivers/mtd/nand/raw/Kconfig @@ -77,32 +77,6 @@ config MTD_NAND_NDFC help NDFC Nand Flash Controllers are integrated in IBM/AMCC's 4xx SoCs -config MTD_NAND_S3C2410 - tristate "Samsung S3C NAND controller" - depends on ARCH_S3C64XX - help - This enables the NAND flash controller on the S3C24xx and S3C64xx - SoCs - - No board specific support is done by this driver, each board - must advertise a platform_device for the driver to attach. - -config MTD_NAND_S3C2410_DEBUG - bool "Samsung S3C NAND controller debug" - depends on MTD_NAND_S3C2410 - help - Enable debugging of the S3C NAND driver - -config MTD_NAND_S3C2410_CLKSTOP - bool "Samsung S3C NAND IDLE clock stop" - depends on MTD_NAND_S3C2410 - default n - help - Stop the clock to the NAND controller when there is no chip - selected to save power. This will mean there is a small delay - when the is NAND chip selected or released, but will save - approximately 5mA of power when there is nothing happening.
- config MTD_NAND_SHARPSL tristate "Sharp SL Series (C7xx + others) NAND controller" depends on ARCH_PXA || COMPILE_TEST diff --git a/drivers/mtd/nand/raw/Makefile b/drivers/mtd/nand/raw/Makefile index 711d043ad4f8..363cd18eaf16 100644 --- a/drivers/mtd/nand/raw/Makefile +++ b/drivers/mtd/nand/raw/Makefile @@ -9,7 +9,6 @@ obj-$(CONFIG_MTD_NAND_DENALI) += denali.o obj-$(CONFIG_MTD_NAND_DENALI_PCI) += denali_pci.o obj-$(CONFIG_MTD_NAND_DENALI_DT) += denali_dt.o obj-$(CONFIG_MTD_NAND_AU1550) += au1550nd.o -obj-$(CONFIG_MTD_NAND_S3C2410) += s3c2410.o obj-$(CONFIG_MTD_NAND_DAVINCI) += davinci_nand.o obj-$(CONFIG_MTD_NAND_DISKONCHIP) += diskonchip.o obj-$(CONFIG_MTD_NAND_FSMC) += fsmc_nand.o diff --git a/drivers/mtd/nand/raw/s3c2410.c b/drivers/mtd/nand/raw/s3c2410.c deleted file mode 100644 index 33130d75c5ba..000000000000 --- a/drivers/mtd/nand/raw/s3c2410.c +++ /dev/null @@ -1,1192 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright © 2004-2008 Simtec Electronics - * http://armlinux.simtec.co.uk/ - * Ben Dooks - * - * Samsung S3C2410/S3C2440/S3C2412 NAND driver -*/ - -#define pr_fmt(fmt) "nand-s3c2410: " fmt - -#ifdef CONFIG_MTD_NAND_S3C2410_DEBUG -#define DEBUG -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include - -#define S3C2410_NFREG(x) (x) - -#define S3C2410_NFCONF S3C2410_NFREG(0x00) -#define S3C2410_NFCMD S3C2410_NFREG(0x04) -#define S3C2410_NFADDR S3C2410_NFREG(0x08) -#define S3C2410_NFDATA S3C2410_NFREG(0x0C) -#define S3C2410_NFSTAT S3C2410_NFREG(0x10) -#define S3C2410_NFECC S3C2410_NFREG(0x14) -#define S3C2440_NFCONT S3C2410_NFREG(0x04) -#define S3C2440_NFCMD S3C2410_NFREG(0x08) -#define S3C2440_NFADDR S3C2410_NFREG(0x0C) -#define S3C2440_NFDATA S3C2410_NFREG(0x10) -#define S3C2440_NFSTAT S3C2410_NFREG(0x20) -#define S3C2440_NFMECC0 S3C2410_NFREG(0x2C) -#define S3C2412_NFSTAT S3C2410_NFREG(0x28) -#define S3C2412_NFMECC0 S3C2410_NFREG(0x34) -#define S3C2410_NFCONF_EN (1<<15) -#define S3C2410_NFCONF_INITECC (1<<12) -#define S3C2410_NFCONF_nFCE (1<<11) -#define S3C2410_NFCONF_TACLS(x) ((x)<<8) -#define S3C2410_NFCONF_TWRPH0(x) ((x)<<4) -#define S3C2410_NFCONF_TWRPH1(x) ((x)<<0) -#define S3C2410_NFSTAT_BUSY (1<<0) -#define S3C2440_NFCONF_TACLS(x) ((x)<<12) -#define S3C2440_NFCONF_TWRPH0(x) ((x)<<8) -#define S3C2440_NFCONF_TWRPH1(x) ((x)<<4) -#define S3C2440_NFCONT_INITECC (1<<4) -#define S3C2440_NFCONT_nFCE (1<<1) -#define S3C2440_NFCONT_ENABLE (1<<0) -#define S3C2440_NFSTAT_READY (1<<0) -#define S3C2412_NFCONF_NANDBOOT (1<<31) -#define S3C2412_NFCONT_INIT_MAIN_ECC (1<<5) -#define S3C2412_NFCONT_nFCE0 (1<<1) -#define S3C2412_NFSTAT_READY (1<<0) - -/* new oob placement block for use with hardware ecc generation - */ -static int s3c2410_ooblayout_ecc(struct mtd_info *mtd, int section, - struct mtd_oob_region *oobregion) -{ - if (section) - return -ERANGE; - - oobregion->offset = 0; - oobregion->length = 3; - - return 0; -} - -static int s3c2410_ooblayout_free(struct mtd_info *mtd, int section, - struct mtd_oob_region *oobregion) -{ - if (section) - return -ERANGE; - - oobregion->offset = 8; - oobregion->length = 8; - - return 0; -} - -static const struct mtd_ooblayout_ops s3c2410_ooblayout_ops = { - .ecc = s3c2410_ooblayout_ecc, - .free = s3c2410_ooblayout_free, -}; - -/* controller and mtd information */ - -struct s3c2410_nand_info; - -/** - * struct s3c2410_nand_mtd - driver MTD structure - * @chip: The NAND chip information. 
- * @set: The platform information supplied for this set of NAND chips. - * @info: Link back to the hardware information. -*/ -struct s3c2410_nand_mtd { - struct nand_chip chip; - struct s3c2410_nand_set *set; - struct s3c2410_nand_info *info; -}; - -enum s3c_cpu_type { - TYPE_S3C2410, - TYPE_S3C2412, - TYPE_S3C2440, -}; - -enum s3c_nand_clk_state { - CLOCK_DISABLE = 0, - CLOCK_ENABLE, - CLOCK_SUSPEND, -}; - -/* overview of the s3c2410 nand state */ - -/** - * struct s3c2410_nand_info - NAND controller state. - * @controller: Base controller structure. - * @mtds: An array of MTD instances on this controller. - * @platform: The platform data for this board. - * @device: The platform device we bound to. - * @clk: The clock resource for this controller. - * @regs: The area mapped for the hardware registers. - * @sel_reg: Pointer to the register controlling the NAND selection. - * @sel_bit: The bit in @sel_reg to select the NAND chip. - * @mtd_count: The number of MTDs created from this controller. - * @save_sel: The contents of @sel_reg to be saved over suspend. - * @clk_rate: The clock rate from @clk. - * @clk_state: The current clock state. - * @cpu_type: The exact type of this controller. - */ -struct s3c2410_nand_info { - /* mtd info */ - struct nand_controller controller; - struct s3c2410_nand_mtd *mtds; - struct s3c2410_platform_nand *platform; - - /* device info */ - struct device *device; - struct clk *clk; - void __iomem *regs; - void __iomem *sel_reg; - int sel_bit; - int mtd_count; - unsigned long save_sel; - unsigned long clk_rate; - enum s3c_nand_clk_state clk_state; - - enum s3c_cpu_type cpu_type; -}; - -struct s3c24XX_nand_devtype_data { - enum s3c_cpu_type type; -}; - -/* conversion functions */ - -static struct s3c2410_nand_mtd *s3c2410_nand_mtd_toours(struct mtd_info *mtd) -{ - return container_of(mtd_to_nand(mtd), struct s3c2410_nand_mtd, - chip); -} - -static struct s3c2410_nand_info *s3c2410_nand_mtd_toinfo(struct mtd_info *mtd) -{ - return s3c2410_nand_mtd_toours(mtd)->info; -} - -static struct s3c2410_nand_info *to_nand_info(struct platform_device *dev) -{ - return platform_get_drvdata(dev); -} - -static struct s3c2410_platform_nand *to_nand_plat(struct platform_device *dev) -{ - return dev_get_platdata(&dev->dev); -} - -static inline int allow_clk_suspend(struct s3c2410_nand_info *info) -{ -#ifdef CONFIG_MTD_NAND_S3C2410_CLKSTOP - return 1; -#else - return 0; -#endif -} - -/** - * s3c2410_nand_clk_set_state - Enable, disable or suspend NAND clock. - * @info: The controller instance. - * @new_state: State to which clock should be set. - */ -static void s3c2410_nand_clk_set_state(struct s3c2410_nand_info *info, - enum s3c_nand_clk_state new_state) -{ - if (!allow_clk_suspend(info) && new_state == CLOCK_SUSPEND) - return; - - if (info->clk_state == CLOCK_ENABLE) { - if (new_state != CLOCK_ENABLE) - clk_disable_unprepare(info->clk); - } else { - if (new_state == CLOCK_ENABLE) - clk_prepare_enable(info->clk); - } - - info->clk_state = new_state; -} - -/* timing calculations */ - -#define NS_IN_KHZ 1000000 - -/** - * s3c_nand_calc_rate - calculate timing data. - * @wanted: The cycle time in nanoseconds. - * @clk: The clock rate in kHz. - * @max: The maximum divider value. - * - * Calculate the timing value from the given parameters. 
- */ -static int s3c_nand_calc_rate(int wanted, unsigned long clk, int max) -{ - int result; - - result = DIV_ROUND_UP((wanted * clk), NS_IN_KHZ); - - pr_debug("result %d from %ld, %d\n", result, clk, wanted); - - if (result > max) { - pr_err("%d ns is too big for current clock rate %ld\n", - wanted, clk); - return -1; - } - - if (result < 1) - result = 1; - - return result; -} - -#define to_ns(ticks, clk) (((ticks) * NS_IN_KHZ) / (unsigned int)(clk)) - -/* controller setup */ - -/** - * s3c2410_nand_setrate - setup controller timing information. - * @info: The controller instance. - * - * Given the information supplied by the platform, calculate and set - * the necessary timing registers in the hardware to generate the - * necessary timing cycles to the hardware. - */ -static int s3c2410_nand_setrate(struct s3c2410_nand_info *info) -{ - struct s3c2410_platform_nand *plat = info->platform; - int tacls_max = (info->cpu_type == TYPE_S3C2412) ? 8 : 4; - int tacls, twrph0, twrph1; - unsigned long clkrate = clk_get_rate(info->clk); - unsigned long set, cfg, mask; - unsigned long flags; - - /* calculate the timing information for the controller */ - - info->clk_rate = clkrate; - clkrate /= 1000; /* turn clock into kHz for ease of use */ - - if (plat != NULL) { - tacls = s3c_nand_calc_rate(plat->tacls, clkrate, tacls_max); - twrph0 = s3c_nand_calc_rate(plat->twrph0, clkrate, 8); - twrph1 = s3c_nand_calc_rate(plat->twrph1, clkrate, 8); - } else { - /* default timings */ - tacls = tacls_max; - twrph0 = 8; - twrph1 = 8; - } - - if (tacls < 0 || twrph0 < 0 || twrph1 < 0) { - dev_err(info->device, "cannot get suitable timings\n"); - return -EINVAL; - } - - dev_info(info->device, "Tacls=%d, %dns Twrph0=%d %dns, Twrph1=%d %dns\n", - tacls, to_ns(tacls, clkrate), twrph0, to_ns(twrph0, clkrate), - twrph1, to_ns(twrph1, clkrate)); - - switch (info->cpu_type) { - case TYPE_S3C2410: - mask = (S3C2410_NFCONF_TACLS(3) | - S3C2410_NFCONF_TWRPH0(7) | - S3C2410_NFCONF_TWRPH1(7)); - set = S3C2410_NFCONF_EN; - set |= S3C2410_NFCONF_TACLS(tacls - 1); - set |= S3C2410_NFCONF_TWRPH0(twrph0 - 1); - set |= S3C2410_NFCONF_TWRPH1(twrph1 - 1); - break; - - case TYPE_S3C2440: - case TYPE_S3C2412: - mask = (S3C2440_NFCONF_TACLS(tacls_max - 1) | - S3C2440_NFCONF_TWRPH0(7) | - S3C2440_NFCONF_TWRPH1(7)); - - set = S3C2440_NFCONF_TACLS(tacls - 1); - set |= S3C2440_NFCONF_TWRPH0(twrph0 - 1); - set |= S3C2440_NFCONF_TWRPH1(twrph1 - 1); - break; - - default: - BUG(); - } - - local_irq_save(flags); - - cfg = readl(info->regs + S3C2410_NFCONF); - cfg &= ~mask; - cfg |= set; - writel(cfg, info->regs + S3C2410_NFCONF); - - local_irq_restore(flags); - - dev_dbg(info->device, "NF_CONF is 0x%lx\n", cfg); - - return 0; -} - -/** - * s3c2410_nand_inithw - basic hardware initialisation - * @info: The hardware state. - * - * Do the basic initialisation of the hardware, using s3c2410_nand_setrate() - * to setup the hardware access speeds and set the controller to be enabled. -*/ -static int s3c2410_nand_inithw(struct s3c2410_nand_info *info) -{ - int ret; - - ret = s3c2410_nand_setrate(info); - if (ret < 0) - return ret; - - switch (info->cpu_type) { - case TYPE_S3C2410: - default: - break; - - case TYPE_S3C2440: - case TYPE_S3C2412: - /* enable the controller and de-assert nFCE */ - - writel(S3C2440_NFCONT_ENABLE, info->regs + S3C2440_NFCONT); - } - - return 0; -} - -/** - * s3c2410_nand_select_chip - select the given nand chip - * @this: NAND chip object. - * @chip: The chip number. 
- * - * This is called by the MTD layer to either select a given chip for the - * @mtd instance, or to indicate that the access has finished and the - * chip can be de-selected. - * - * The routine ensures that the nFCE line is correctly setup, and any - * platform specific selection code is called to route nFCE to the specific - * chip. - */ -static void s3c2410_nand_select_chip(struct nand_chip *this, int chip) -{ - struct s3c2410_nand_info *info; - struct s3c2410_nand_mtd *nmtd; - unsigned long cur; - - nmtd = nand_get_controller_data(this); - info = nmtd->info; - - if (chip != -1) - s3c2410_nand_clk_set_state(info, CLOCK_ENABLE); - - cur = readl(info->sel_reg); - - if (chip == -1) { - cur |= info->sel_bit; - } else { - if (nmtd->set != NULL && chip > nmtd->set->nr_chips) { - dev_err(info->device, "invalid chip %d\n", chip); - return; - } - - if (info->platform != NULL) { - if (info->platform->select_chip != NULL) - (info->platform->select_chip) (nmtd->set, chip); - } - - cur &= ~info->sel_bit; - } - - writel(cur, info->sel_reg); - - if (chip == -1) - s3c2410_nand_clk_set_state(info, CLOCK_SUSPEND); -} - -/* s3c2410_nand_hwcontrol - * - * Issue command and address cycles to the chip -*/ - -static void s3c2410_nand_hwcontrol(struct nand_chip *chip, int cmd, - unsigned int ctrl) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - - if (cmd == NAND_CMD_NONE) - return; - - if (ctrl & NAND_CLE) - writeb(cmd, info->regs + S3C2410_NFCMD); - else - writeb(cmd, info->regs + S3C2410_NFADDR); -} - -/* command and control functions */ - -static void s3c2440_nand_hwcontrol(struct nand_chip *chip, int cmd, - unsigned int ctrl) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - - if (cmd == NAND_CMD_NONE) - return; - - if (ctrl & NAND_CLE) - writeb(cmd, info->regs + S3C2440_NFCMD); - else - writeb(cmd, info->regs + S3C2440_NFADDR); -} - -/* s3c2410_nand_devready() - * - * returns 0 if the nand is busy, 1 if it is ready -*/ - -static int s3c2410_nand_devready(struct nand_chip *chip) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - return readb(info->regs + S3C2410_NFSTAT) & S3C2410_NFSTAT_BUSY; -} - -static int s3c2440_nand_devready(struct nand_chip *chip) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - return readb(info->regs + S3C2440_NFSTAT) & S3C2440_NFSTAT_READY; -} - -static int s3c2412_nand_devready(struct nand_chip *chip) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - return readb(info->regs + S3C2412_NFSTAT) & S3C2412_NFSTAT_READY; -} - -/* ECC handling functions */ - -static int s3c2410_nand_correct_data(struct nand_chip *chip, u_char *dat, - u_char *read_ecc, u_char *calc_ecc) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - unsigned int diff0, diff1, diff2; - unsigned int bit, byte; - - pr_debug("%s(%p,%p,%p,%p)\n", __func__, mtd, dat, read_ecc, calc_ecc); - - diff0 = read_ecc[0] ^ calc_ecc[0]; - diff1 = read_ecc[1] ^ calc_ecc[1]; - diff2 = read_ecc[2] ^ calc_ecc[2]; - - pr_debug("%s: rd %*phN calc %*phN diff %02x%02x%02x\n", - __func__, 3, read_ecc, 3, calc_ecc, - diff0, diff1, diff2); - - if (diff0 == 0 && diff1 == 0 && diff2 == 0) - return 0; /* ECC is ok */ - - /* sometimes people do not think about using 
the ECC, so check - * to see if we have an 0xff,0xff,0xff read ECC and then ignore - * the error, on the assumption that this is an un-eccd page. - */ - if (read_ecc[0] == 0xff && read_ecc[1] == 0xff && read_ecc[2] == 0xff - && info->platform->ignore_unset_ecc) - return 0; - - /* Can we correct this ECC (ie, one row and column change). - * Note, this is similar to the 256 error code on smartmedia */ - - if (((diff0 ^ (diff0 >> 1)) & 0x55) == 0x55 && - ((diff1 ^ (diff1 >> 1)) & 0x55) == 0x55 && - ((diff2 ^ (diff2 >> 1)) & 0x55) == 0x55) { - /* calculate the bit position of the error */ - - bit = ((diff2 >> 3) & 1) | - ((diff2 >> 4) & 2) | - ((diff2 >> 5) & 4); - - /* calculate the byte position of the error */ - - byte = ((diff2 << 7) & 0x100) | - ((diff1 << 0) & 0x80) | - ((diff1 << 1) & 0x40) | - ((diff1 << 2) & 0x20) | - ((diff1 << 3) & 0x10) | - ((diff0 >> 4) & 0x08) | - ((diff0 >> 3) & 0x04) | - ((diff0 >> 2) & 0x02) | - ((diff0 >> 1) & 0x01); - - dev_dbg(info->device, "correcting error bit %d, byte %d\n", - bit, byte); - - dat[byte] ^= (1 << bit); - return 1; - } - - /* if there is only one bit difference in the ECC, then - * one of only a row or column parity has changed, which - * means the error is most probably in the ECC itself */ - - diff0 |= (diff1 << 8); - diff0 |= (diff2 << 16); - - /* equal to "(diff0 & ~(1 << __ffs(diff0)))" */ - if ((diff0 & (diff0 - 1)) == 0) - return 1; - - return -1; -} - -/* ECC functions - * - * These allow the s3c2410 and s3c2440 to use the controller's ECC - * generator block to ECC the data as it passes through] -*/ - -static void s3c2410_nand_enable_hwecc(struct nand_chip *chip, int mode) -{ - struct s3c2410_nand_info *info; - unsigned long ctrl; - - info = s3c2410_nand_mtd_toinfo(nand_to_mtd(chip)); - ctrl = readl(info->regs + S3C2410_NFCONF); - ctrl |= S3C2410_NFCONF_INITECC; - writel(ctrl, info->regs + S3C2410_NFCONF); -} - -static void s3c2412_nand_enable_hwecc(struct nand_chip *chip, int mode) -{ - struct s3c2410_nand_info *info; - unsigned long ctrl; - - info = s3c2410_nand_mtd_toinfo(nand_to_mtd(chip)); - ctrl = readl(info->regs + S3C2440_NFCONT); - writel(ctrl | S3C2412_NFCONT_INIT_MAIN_ECC, - info->regs + S3C2440_NFCONT); -} - -static void s3c2440_nand_enable_hwecc(struct nand_chip *chip, int mode) -{ - struct s3c2410_nand_info *info; - unsigned long ctrl; - - info = s3c2410_nand_mtd_toinfo(nand_to_mtd(chip)); - ctrl = readl(info->regs + S3C2440_NFCONT); - writel(ctrl | S3C2440_NFCONT_INITECC, info->regs + S3C2440_NFCONT); -} - -static int s3c2410_nand_calculate_ecc(struct nand_chip *chip, - const u_char *dat, u_char *ecc_code) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - - ecc_code[0] = readb(info->regs + S3C2410_NFECC + 0); - ecc_code[1] = readb(info->regs + S3C2410_NFECC + 1); - ecc_code[2] = readb(info->regs + S3C2410_NFECC + 2); - - pr_debug("%s: returning ecc %*phN\n", __func__, 3, ecc_code); - - return 0; -} - -static int s3c2412_nand_calculate_ecc(struct nand_chip *chip, - const u_char *dat, u_char *ecc_code) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - unsigned long ecc = readl(info->regs + S3C2412_NFMECC0); - - ecc_code[0] = ecc; - ecc_code[1] = ecc >> 8; - ecc_code[2] = ecc >> 16; - - pr_debug("%s: returning ecc %*phN\n", __func__, 3, ecc_code); - - return 0; -} - -static int s3c2440_nand_calculate_ecc(struct nand_chip *chip, - const u_char *dat, u_char *ecc_code) -{ - struct mtd_info *mtd = 
nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - unsigned long ecc = readl(info->regs + S3C2440_NFMECC0); - - ecc_code[0] = ecc; - ecc_code[1] = ecc >> 8; - ecc_code[2] = ecc >> 16; - - pr_debug("%s: returning ecc %06lx\n", __func__, ecc & 0xffffff); - - return 0; -} - -/* over-ride the standard functions for a little more speed. We can - * use read/write block to move the data buffers to/from the controller -*/ - -static void s3c2410_nand_read_buf(struct nand_chip *this, u_char *buf, int len) -{ - readsb(this->legacy.IO_ADDR_R, buf, len); -} - -static void s3c2440_nand_read_buf(struct nand_chip *this, u_char *buf, int len) -{ - struct mtd_info *mtd = nand_to_mtd(this); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - - readsl(info->regs + S3C2440_NFDATA, buf, len >> 2); - - /* cleanup if we've got less than a word to do */ - if (len & 3) { - buf += len & ~3; - - for (; len & 3; len--) - *buf++ = readb(info->regs + S3C2440_NFDATA); - } -} - -static void s3c2410_nand_write_buf(struct nand_chip *this, const u_char *buf, - int len) -{ - writesb(this->legacy.IO_ADDR_W, buf, len); -} - -static void s3c2440_nand_write_buf(struct nand_chip *this, const u_char *buf, - int len) -{ - struct mtd_info *mtd = nand_to_mtd(this); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - - writesl(info->regs + S3C2440_NFDATA, buf, len >> 2); - - /* cleanup any fractional write */ - if (len & 3) { - buf += len & ~3; - - for (; len & 3; len--, buf++) - writeb(*buf, info->regs + S3C2440_NFDATA); - } -} - -/* device management functions */ - -static void s3c24xx_nand_remove(struct platform_device *pdev) -{ - struct s3c2410_nand_info *info = to_nand_info(pdev); - - if (info == NULL) - return; - - /* Release all our mtds and their partitions, then go through - * freeing the resources used - */ - - if (info->mtds != NULL) { - struct s3c2410_nand_mtd *ptr = info->mtds; - int mtdno; - - for (mtdno = 0; mtdno < info->mtd_count; mtdno++, ptr++) { - pr_debug("releasing mtd %d (%p)\n", mtdno, ptr); - WARN_ON(mtd_device_unregister(nand_to_mtd(&ptr->chip))); - nand_cleanup(&ptr->chip); - } - } - - /* free the common resources */ - - if (!IS_ERR(info->clk)) - s3c2410_nand_clk_set_state(info, CLOCK_DISABLE); -} - -static int s3c2410_nand_add_partition(struct s3c2410_nand_info *info, - struct s3c2410_nand_mtd *mtd, - struct s3c2410_nand_set *set) -{ - if (set) { - struct mtd_info *mtdinfo = nand_to_mtd(&mtd->chip); - - mtdinfo->name = set->name; - - return mtd_device_register(mtdinfo, set->partitions, - set->nr_partitions); - } - - return -ENODEV; -} - -static int s3c2410_nand_setup_interface(struct nand_chip *chip, int csline, - const struct nand_interface_config *conf) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - struct s3c2410_platform_nand *pdata = info->platform; - const struct nand_sdr_timings *timings; - int tacls; - - timings = nand_get_sdr_timings(conf); - if (IS_ERR(timings)) - return -ENOTSUPP; - - tacls = timings->tCLS_min - timings->tWP_min; - if (tacls < 0) - tacls = 0; - - pdata->tacls = DIV_ROUND_UP(tacls, 1000); - pdata->twrph0 = DIV_ROUND_UP(timings->tWP_min, 1000); - pdata->twrph1 = DIV_ROUND_UP(timings->tCLH_min, 1000); - - return s3c2410_nand_setrate(info); -} - -/** - * s3c2410_nand_init_chip - initialise a single instance of an chip - * @info: The base NAND controller the chip is on. - * @nmtd: The new controller MTD instance to fill in. 
- * @set: The information passed from the board specific platform data. - * - * Initialise the given @nmtd from the information in @info and @set. This - * readies the structure for use with the MTD layer functions by ensuring - * all pointers are setup and the necessary control routines selected. - */ -static void s3c2410_nand_init_chip(struct s3c2410_nand_info *info, - struct s3c2410_nand_mtd *nmtd, - struct s3c2410_nand_set *set) -{ - struct device_node *np = info->device->of_node; - struct nand_chip *chip = &nmtd->chip; - void __iomem *regs = info->regs; - - nand_set_flash_node(chip, set->of_node); - - chip->legacy.write_buf = s3c2410_nand_write_buf; - chip->legacy.read_buf = s3c2410_nand_read_buf; - chip->legacy.select_chip = s3c2410_nand_select_chip; - chip->legacy.chip_delay = 50; - nand_set_controller_data(chip, nmtd); - chip->options = set->options; - chip->controller = &info->controller; - - /* - * let's keep behavior unchanged for legacy boards booting via pdata and - * auto-detect timings only when booting with a device tree. - */ - if (!np) - chip->options |= NAND_KEEP_TIMINGS; - - switch (info->cpu_type) { - case TYPE_S3C2410: - chip->legacy.IO_ADDR_W = regs + S3C2410_NFDATA; - info->sel_reg = regs + S3C2410_NFCONF; - info->sel_bit = S3C2410_NFCONF_nFCE; - chip->legacy.cmd_ctrl = s3c2410_nand_hwcontrol; - chip->legacy.dev_ready = s3c2410_nand_devready; - break; - - case TYPE_S3C2440: - chip->legacy.IO_ADDR_W = regs + S3C2440_NFDATA; - info->sel_reg = regs + S3C2440_NFCONT; - info->sel_bit = S3C2440_NFCONT_nFCE; - chip->legacy.cmd_ctrl = s3c2440_nand_hwcontrol; - chip->legacy.dev_ready = s3c2440_nand_devready; - chip->legacy.read_buf = s3c2440_nand_read_buf; - chip->legacy.write_buf = s3c2440_nand_write_buf; - break; - - case TYPE_S3C2412: - chip->legacy.IO_ADDR_W = regs + S3C2440_NFDATA; - info->sel_reg = regs + S3C2440_NFCONT; - info->sel_bit = S3C2412_NFCONT_nFCE0; - chip->legacy.cmd_ctrl = s3c2440_nand_hwcontrol; - chip->legacy.dev_ready = s3c2412_nand_devready; - - if (readl(regs + S3C2410_NFCONF) & S3C2412_NFCONF_NANDBOOT) - dev_info(info->device, "System booted from NAND\n"); - - break; - } - - chip->legacy.IO_ADDR_R = chip->legacy.IO_ADDR_W; - - nmtd->info = info; - nmtd->set = set; - - chip->ecc.engine_type = info->platform->engine_type; - - /* - * If you use u-boot BBT creation code, specifying this flag will - * let the kernel fish out the BBT from the NAND. - */ - if (set->flash_bbt) - chip->bbt_options |= NAND_BBT_USE_FLASH; -} - -/** - * s3c2410_nand_attach_chip - Init the ECC engine after NAND scan - * @chip: The NAND chip - * - * This hook is called by the core after the identification of the NAND chip, - * once the relevant per-chip information is up to date.. This call ensure that - * we update the internal state accordingly. - * - * The internal state is currently limited to the ECC state information. -*/ -static int s3c2410_nand_attach_chip(struct nand_chip *chip) -{ - struct mtd_info *mtd = nand_to_mtd(chip); - struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); - - switch (chip->ecc.engine_type) { - - case NAND_ECC_ENGINE_TYPE_NONE: - dev_info(info->device, "ECC disabled\n"); - break; - - case NAND_ECC_ENGINE_TYPE_SOFT: - /* - * This driver expects Hamming based ECC when engine_type is set - * to NAND_ECC_ENGINE_TYPE_SOFT. Force ecc.algo to - * NAND_ECC_ALGO_HAMMING to avoid adding an extra ecc_algo field - * to s3c2410_platform_nand. 
- */ - chip->ecc.algo = NAND_ECC_ALGO_HAMMING; - dev_info(info->device, "soft ECC\n"); - break; - - case NAND_ECC_ENGINE_TYPE_ON_HOST: - chip->ecc.calculate = s3c2410_nand_calculate_ecc; - chip->ecc.correct = s3c2410_nand_correct_data; - chip->ecc.strength = 1; - - switch (info->cpu_type) { - case TYPE_S3C2410: - chip->ecc.hwctl = s3c2410_nand_enable_hwecc; - chip->ecc.calculate = s3c2410_nand_calculate_ecc; - break; - - case TYPE_S3C2412: - chip->ecc.hwctl = s3c2412_nand_enable_hwecc; - chip->ecc.calculate = s3c2412_nand_calculate_ecc; - break; - - case TYPE_S3C2440: - chip->ecc.hwctl = s3c2440_nand_enable_hwecc; - chip->ecc.calculate = s3c2440_nand_calculate_ecc; - break; - } - - dev_dbg(info->device, "chip %p => page shift %d\n", - chip, chip->page_shift); - - /* change the behaviour depending on whether we are using - * the large or small page nand device */ - if (chip->page_shift > 10) { - chip->ecc.size = 256; - chip->ecc.bytes = 3; - } else { - chip->ecc.size = 512; - chip->ecc.bytes = 3; - mtd_set_ooblayout(nand_to_mtd(chip), - &s3c2410_ooblayout_ops); - } - - dev_info(info->device, "hardware ECC\n"); - break; - - default: - dev_err(info->device, "invalid ECC mode!\n"); - return -EINVAL; - } - - if (chip->bbt_options & NAND_BBT_USE_FLASH) - chip->options |= NAND_SKIP_BBTSCAN; - - return 0; -} - -static const struct nand_controller_ops s3c24xx_nand_controller_ops = { - .attach_chip = s3c2410_nand_attach_chip, - .setup_interface = s3c2410_nand_setup_interface, -}; - -static int s3c24xx_nand_probe_dt(struct platform_device *pdev) -{ - const struct s3c24XX_nand_devtype_data *devtype_data; - struct s3c2410_platform_nand *pdata; - struct s3c2410_nand_info *info = platform_get_drvdata(pdev); - struct device_node *np = pdev->dev.of_node, *child; - struct s3c2410_nand_set *sets; - - devtype_data = of_device_get_match_data(&pdev->dev); - if (!devtype_data) - return -ENODEV; - - info->cpu_type = devtype_data->type; - - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) - return -ENOMEM; - - pdev->dev.platform_data = pdata; - - pdata->nr_sets = of_get_child_count(np); - if (!pdata->nr_sets) - return 0; - - sets = devm_kcalloc(&pdev->dev, pdata->nr_sets, sizeof(*sets), - GFP_KERNEL); - if (!sets) - return -ENOMEM; - - pdata->sets = sets; - - for_each_available_child_of_node(np, child) { - sets->name = (char *)child->name; - sets->of_node = child; - sets->nr_chips = 1; - - of_node_get(child); - - sets++; - } - - return 0; -} - -static int s3c24xx_nand_probe_pdata(struct platform_device *pdev) -{ - struct s3c2410_nand_info *info = platform_get_drvdata(pdev); - - info->cpu_type = platform_get_device_id(pdev)->driver_data; - - return 0; -} - -/* s3c24xx_nand_probe - * - * called by device layer when it finds a device matching - * one our driver can handled. 
This code checks to see if - * it can allocate all necessary resources then calls the - * nand layer to look for devices -*/ -static int s3c24xx_nand_probe(struct platform_device *pdev) -{ - struct s3c2410_platform_nand *plat; - struct s3c2410_nand_info *info; - struct s3c2410_nand_mtd *nmtd; - struct s3c2410_nand_set *sets; - struct resource *res; - int err = 0; - int size; - int nr_sets; - int setno; - - info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL); - if (info == NULL) { - err = -ENOMEM; - goto exit_error; - } - - platform_set_drvdata(pdev, info); - - nand_controller_init(&info->controller); - info->controller.ops = &s3c24xx_nand_controller_ops; - - /* get the clock source and enable it */ - - info->clk = devm_clk_get(&pdev->dev, "nand"); - if (IS_ERR(info->clk)) { - dev_err(&pdev->dev, "failed to get clock\n"); - err = -ENOENT; - goto exit_error; - } - - s3c2410_nand_clk_set_state(info, CLOCK_ENABLE); - - if (pdev->dev.of_node) - err = s3c24xx_nand_probe_dt(pdev); - else - err = s3c24xx_nand_probe_pdata(pdev); - - if (err) - goto exit_error; - - plat = to_nand_plat(pdev); - - /* allocate and map the resource */ - - /* currently we assume we have the one resource */ - res = pdev->resource; - size = resource_size(res); - - info->device = &pdev->dev; - info->platform = plat; - - info->regs = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(info->regs)) { - err = PTR_ERR(info->regs); - goto exit_error; - } - - dev_dbg(&pdev->dev, "mapped registers at %p\n", info->regs); - - if (!plat->sets || plat->nr_sets < 1) { - err = -EINVAL; - goto exit_error; - } - - sets = plat->sets; - nr_sets = plat->nr_sets; - - info->mtd_count = nr_sets; - - /* allocate our information */ - - size = nr_sets * sizeof(*info->mtds); - info->mtds = devm_kzalloc(&pdev->dev, size, GFP_KERNEL); - if (info->mtds == NULL) { - err = -ENOMEM; - goto exit_error; - } - - /* initialise all possible chips */ - - nmtd = info->mtds; - - for (setno = 0; setno < nr_sets; setno++, nmtd++, sets++) { - struct mtd_info *mtd = nand_to_mtd(&nmtd->chip); - - pr_debug("initialising set %d (%p, info %p)\n", - setno, nmtd, info); - - mtd->dev.parent = &pdev->dev; - s3c2410_nand_init_chip(info, nmtd, sets); - - err = nand_scan(&nmtd->chip, sets ? sets->nr_chips : 1); - if (err) - goto exit_error; - - s3c2410_nand_add_partition(info, nmtd, sets); - } - - /* initialise the hardware */ - err = s3c2410_nand_inithw(info); - if (err != 0) - goto exit_error; - - if (allow_clk_suspend(info)) { - dev_info(&pdev->dev, "clock idle support enabled\n"); - s3c2410_nand_clk_set_state(info, CLOCK_SUSPEND); - } - - return 0; - - exit_error: - s3c24xx_nand_remove(pdev); - - if (err == 0) - err = -EINVAL; - return err; -} - -/* PM Support */ -#ifdef CONFIG_PM - -static int s3c24xx_nand_suspend(struct platform_device *dev, pm_message_t pm) -{ - struct s3c2410_nand_info *info = platform_get_drvdata(dev); - - if (info) { - info->save_sel = readl(info->sel_reg); - - /* For the moment, we must ensure nFCE is high during - * the time we are suspended. This really should be - * handled by suspending the MTDs we are using, but - * that is currently not the case. 
*/ - - writel(info->save_sel | info->sel_bit, info->sel_reg); - - s3c2410_nand_clk_set_state(info, CLOCK_DISABLE); - } - - return 0; -} - -static int s3c24xx_nand_resume(struct platform_device *dev) -{ - struct s3c2410_nand_info *info = platform_get_drvdata(dev); - unsigned long sel; - - if (info) { - s3c2410_nand_clk_set_state(info, CLOCK_ENABLE); - s3c2410_nand_inithw(info); - - /* Restore the state of the nFCE line. */ - - sel = readl(info->sel_reg); - sel &= ~info->sel_bit; - sel |= info->save_sel & info->sel_bit; - writel(sel, info->sel_reg); - - s3c2410_nand_clk_set_state(info, CLOCK_SUSPEND); - } - - return 0; -} - -#else -#define s3c24xx_nand_suspend NULL -#define s3c24xx_nand_resume NULL -#endif - -/* driver device registration */ - -static const struct platform_device_id s3c24xx_driver_ids[] = { - { - .name = "s3c6400-nand", - .driver_data = TYPE_S3C2412, /* compatible with 2412 */ - }, - { } -}; - -MODULE_DEVICE_TABLE(platform, s3c24xx_driver_ids); - -static struct platform_driver s3c24xx_nand_driver = { - .probe = s3c24xx_nand_probe, - .remove = s3c24xx_nand_remove, - .suspend = s3c24xx_nand_suspend, - .resume = s3c24xx_nand_resume, - .id_table = s3c24xx_driver_ids, - .driver = { - .name = "s3c24xx-nand", - }, -}; - -module_platform_driver(s3c24xx_nand_driver); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Ben Dooks "); -MODULE_DESCRIPTION("S3C24XX MTD NAND driver"); diff --git a/include/linux/platform_data/mtd-nand-s3c2410.h b/include/linux/platform_data/mtd-nand-s3c2410.h deleted file mode 100644 index 25390fc3e795..000000000000 --- a/include/linux/platform_data/mtd-nand-s3c2410.h +++ /dev/null @@ -1,70 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (c) 2004 Simtec Electronics - * Ben Dooks - * - * S3C2410 - NAND device controller platform_device info -*/ - -#ifndef __MTD_NAND_S3C2410_H -#define __MTD_NAND_S3C2410_H - -#include - -/** - * struct s3c2410_nand_set - define a set of one or more nand chips - * @flash_bbt: Openmoko u-boot can create a Bad Block Table - * Setting this flag will allow the kernel to - * look for it at boot time and also skip the NAND - * scan. - * @options: Default value to set into 'struct nand_chip' options. - * @nr_chips: Number of chips in this set - * @nr_partitions: Number of partitions pointed to by @partitions - * @name: Name of set (optional) - * @nr_map: Map for low-layer logical to physical chip numbers (option) - * @partitions: The mtd partition list - * - * define a set of one or more nand chips registered with an unique mtd. Also - * allows to pass flag to the underlying NAND layer. 'disable_ecc' will trigger - * a warning at boot time. - */ -struct s3c2410_nand_set { - unsigned int flash_bbt:1; - - unsigned int options; - int nr_chips; - int nr_partitions; - char *name; - int *nr_map; - struct mtd_partition *partitions; - struct device_node *of_node; -}; - -struct s3c2410_platform_nand { - /* timing information for controller, all times in nanoseconds */ - - int tacls; /* time for active CLE/ALE to nWE/nOE */ - int twrph0; /* active time for nWE/nOE */ - int twrph1; /* time for release CLE/ALE from nWE/nOE inactive */ - - unsigned int ignore_unset_ecc:1; - - enum nand_ecc_engine_type engine_type; - - int nr_sets; - struct s3c2410_nand_set *sets; - - void (*select_chip)(struct s3c2410_nand_set *, - int chip); -}; - -/** - * s3c_nand_set_platdata() - register NAND platform data. - * @nand: The NAND platform data to register with s3c_device_nand. 
- * - * This function copies the given NAND platform data, @nand and registers - * it with the s3c_device_nand. This allows @nand to be __initdata. -*/ -extern void s3c_nand_set_platdata(struct s3c2410_platform_nand *nand); - -#endif /*__MTD_NAND_S3C2410_H */ -- cgit v1.2.3 From 7df87820122acd3204565109f636a1367912655a Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 5 Aug 2025 15:45:08 +1000 Subject: pidns: move is-ancestor logic to helper This check will be needed in later patches, and there's no point open-coding it each time. Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/20250805-procfs-pidns-api-v4-1-705f984940e7@cyphar.com Signed-off-by: Christian Brauner --- include/linux/pid_namespace.h | 9 +++++++++ kernel/pid_namespace.c | 22 ++++++++++++++-------- 2 files changed, 23 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c67a5811199..17fdc059f8da 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -84,6 +84,9 @@ extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); extern void put_pid_ns(struct pid_namespace *ns); +extern bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor); + #else /* !CONFIG_PID_NS */ #include @@ -118,6 +121,12 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { return 0; } + +static inline bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor) +{ + return false; +} #endif /* CONFIG_PID_NS */ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7098ed44e717..b7b45c2597ec 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -390,11 +390,23 @@ static void pidns_put(struct ns_common *ns) put_pid_ns(to_pid_ns(ns)); } +bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor) +{ + struct pid_namespace *ns; + + if (child->level < ancestor->level) + return false; + for (ns = child; ns->level > ancestor->level; ns = ns->parent) + ; + return ns == ancestor; +} + static int pidns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); - struct pid_namespace *ancestor, *new = to_pid_ns(ns); + struct pid_namespace *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) @@ -408,13 +420,7 @@ static int pidns_install(struct nsset *nsset, struct ns_common *ns) * this maintains the property that processes and their * children can not escape their current pid namespace. */ - if (new->level < active->level) - return -EINVAL; - - ancestor = new; - while (ancestor->level > active->level) - ancestor = ancestor->parent; - if (ancestor != active) + if (!pidns_is_ancestor(new, active)) return -EINVAL; put_pid_ns(nsproxy->pid_ns_for_children); -- cgit v1.2.3 From 72ca981dba5e98c2b1c2956016cc4be934d9fbea Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Sun, 31 Aug 2025 14:52:37 +0800 Subject: firmware: arm_scmi: Fix function name typo in scmi_perf_proto_ops struct The performance protocol ops table incorrectly referenced power_scale_mw_get instead of the correct power_scale_get. Fix the typo to use the proper function. 
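For context, power_scale_get is the op that consumers such as the SCMI cpufreq driver query to decide how to interpret reported power numbers. A hedged usage sketch follows; the wrapper and its caller are illustrative, on the assumption that the op returns enum scmi_power_scale as declared in scmi_protocol.h:

#include <linux/types.h>
#include <linux/scmi_protocol.h>

/* Illustrative helper: decide whether an energy model can be
 * registered with real milliwatt values rather than abstract units.
 */
static bool scmi_power_in_milliwatts(const struct scmi_perf_proto_ops *perf_ops,
                                     const struct scmi_protocol_handle *ph)
{
        return perf_ops->power_scale_get(ph) == SCMI_POWER_MILLIWATTS;
}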
Signed-off-by: Peng Fan Message-Id: <20250831-scmi-cpufreq-v1-1-493031cf6e9b@nxp.com> Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 688466a0e816..aafaac1496b0 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -153,7 +153,7 @@ struct scmi_perf_domain_info { * for a given device * @fast_switch_rate_limit: gets the minimum time (us) required between * successive fast_switching requests - * @power_scale_mw_get: indicates if the power values provided are in milliWatts + * @power_scale_get: indicates if the power values provided are in milliWatts * or in some other (abstract) scale */ struct scmi_perf_proto_ops { -- cgit v1.2.3 From 61f132ca8c46ffee368a951e516d19d4ae767ea8 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Fri, 29 Aug 2025 13:06:04 +0800 Subject: ptp: add helpers to get the phc_index by of_node or dev Some Ethernet controllers do not have an integrated PTP timer function. Instead, the PTP timer is a separate device that provides the PTP hardware clock for the Ethernet controller to use. Therefore, the Ethernet controller driver needs to obtain the PTP clock's phc_index in its ethtool_ops::get_ts_info(). Currently, most drivers implement this in one of the following ways. 1. The PTP device driver adds a custom API and exports it to the Ethernet controller driver. 2. The PTP device driver adds private data to its device structure. So the private data structure needs to be exposed to the Ethernet controller driver. When registering the ptp clock, ptp_clock_register() always saves the ptp_clock pointer to the private data of ptp_clock::dev. Therefore, as long as ptp_clock::dev is obtained, the phc_index can be obtained. So the following generic APIs can be added to the ptp driver to obtain the phc_index. 1. ptp_clock_index_by_dev(): Obtain the phc_index by the device pointer of the PTP device. 2. ptp_clock_index_by_of_node(): Obtain the phc_index by the of_node pointer of the PTP device. Also, we can add another API like ptp_clock_index_by_fwnode() to get the phc_index by the fwnode of the PTP device. However, this API is not used in this patch set, so it is better to add it when needed.
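As an illustration, a hedged sketch of how a MAC driver without an integrated timer might use the new helper from its .get_ts_info() callback. The driver, its priv layout and the capability flags are hypothetical, and the struct name follows current ethtool conventions rather than anything in this patch:

#include <linux/ethtool.h>
#include <linux/netdevice.h>
#include <linux/ptp_clock_kernel.h>

/* Hypothetical MAC driver private data; @ptp_dev points at the
 * separate PTP timer device whose clock this controller consumes.
 */
struct foo_mac_priv {
        struct device *ptp_dev;
};

static int foo_mac_get_ts_info(struct net_device *ndev,
                               struct kernel_ethtool_ts_info *info)
{
        struct foo_mac_priv *priv = netdev_priv(ndev);

        /* No subsystem-private API needed to resolve the PHC index */
        info->phc_index = ptp_clock_index_by_dev(priv->ptp_dev);
        info->so_timestamping = SOF_TIMESTAMPING_TX_HARDWARE |
                                SOF_TIMESTAMPING_RX_HARDWARE |
                                SOF_TIMESTAMPING_RAW_HARDWARE;

        return 0;
}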
Suggested-by: Vladimir Oltean Signed-off-by: Wei Fang Reviewed-by: Frank Li Link: https://patch.msgid.link/20250829050615.1247468-4-wei.fang@nxp.com Signed-off-by: Paolo Abeni --- drivers/ptp/ptp_clock.c | 53 ++++++++++++++++++++++++++++++++++++++++ include/linux/ptp_clock_kernel.h | 22 +++++++++++++++++ 2 files changed, 75 insertions(+) (limited to 'include/linux') diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 3e0726c6f55b..5739a57958c7 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -488,6 +489,58 @@ int ptp_clock_index(struct ptp_clock *ptp) } EXPORT_SYMBOL(ptp_clock_index); +static int ptp_clock_of_node_match(struct device *dev, const void *data) +{ + const struct device_node *parent_np = data; + + return (dev->parent && dev_of_node(dev->parent) == parent_np); +} + +int ptp_clock_index_by_of_node(struct device_node *np) +{ + struct ptp_clock *ptp; + struct device *dev; + int phc_index; + + dev = class_find_device(&ptp_class, NULL, np, + ptp_clock_of_node_match); + if (!dev) + return -1; + + ptp = dev_get_drvdata(dev); + phc_index = ptp_clock_index(ptp); + put_device(dev); + + return phc_index; +} +EXPORT_SYMBOL_GPL(ptp_clock_index_by_of_node); + +static int ptp_clock_dev_match(struct device *dev, const void *data) +{ + const struct device *parent = data; + + return dev->parent == parent; +} + +int ptp_clock_index_by_dev(struct device *parent) +{ + struct ptp_clock *ptp; + struct device *dev; + int phc_index; + + dev = class_find_device(&ptp_class, NULL, parent, + ptp_clock_dev_match); + if (!dev) + return -1; + + ptp = dev_get_drvdata(dev); + phc_index = ptp_clock_index(ptp); + put_device(dev); + + return phc_index; +} +EXPORT_SYMBOL_GPL(ptp_clock_index_by_dev); + int ptp_find_pin(struct ptp_clock *ptp, enum ptp_pin_function func, unsigned int chan) { diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 3d089bd4d5e9..7dd7951b23d5 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -360,6 +360,24 @@ extern void ptp_clock_event(struct ptp_clock *ptp, extern int ptp_clock_index(struct ptp_clock *ptp); +/** + * ptp_clock_index_by_of_node() - obtain the device index of + * a PTP clock based on the PTP device of_node + * + * @np: The device of_node pointer of the PTP device. + * Return: The PHC index on success or -1 on failure. + */ +int ptp_clock_index_by_of_node(struct device_node *np); + +/** + * ptp_clock_index_by_dev() - obtain the device index of + * a PTP clock based on the PTP device. + * + * @parent: The parent device (PTP device) pointer of the PTP clock. + * Return: The PHC index on success or -1 on failure. 
+ */ +int ptp_clock_index_by_dev(struct device *parent); + /** * ptp_find_pin() - obtain the pin index of a given auxiliary function * @@ -425,6 +443,10 @@ static inline void ptp_clock_event(struct ptp_clock *ptp, { } static inline int ptp_clock_index(struct ptp_clock *ptp) { return -1; } +static inline int ptp_clock_index_by_of_node(struct device_node *np) +{ return -1; } +static inline int ptp_clock_index_by_dev(struct device *parent) +{ return -1; } static inline int ptp_find_pin(struct ptp_clock *ptp, enum ptp_pin_function func, unsigned int chan) { return -1; } -- cgit v1.2.3 From e551fa3159e3050c26ff010c3b595b45d7eb071a Mon Sep 17 00:00:00 2001 From: Qunqin Zhao Date: Sat, 5 Jul 2025 15:20:42 +0800 Subject: mfd: Add support for Loongson Security Engine chip controller Loongson Security Engine chip supports RNG, SM2, SM3 and SM4 accelerator engines. This is the base driver for other specific engine drivers. Co-developed-by: Yinggang Gu Signed-off-by: Yinggang Gu Signed-off-by: Qunqin Zhao Reviewed-by: Huacai Chen Link: https://lore.kernel.org/r/20250705072045.1067-2-zhaoqunqin@loongson.cn Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 11 ++ drivers/mfd/Makefile | 2 + drivers/mfd/loongson-se.c | 253 ++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/loongson-se.h | 53 +++++++++ 4 files changed, 319 insertions(+) create mode 100644 drivers/mfd/loongson-se.c create mode 100644 include/linux/mfd/loongson-se.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 425c5fba6cb1..d30806ae4298 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -2428,6 +2428,17 @@ config MFD_INTEL_M10_BMC_PMCI additional drivers must be enabled in order to use the functionality of the device. +config MFD_LOONGSON_SE + tristate "Loongson Security Engine chip controller driver" + depends on LOONGARCH && ACPI + select MFD_CORE + help + The Loongson Security Engine chip supports RNG, SM2, SM3 and + SM4 accelerator engines. Each engine have its own DMA buffer + provided by the controller. The kernel cannot directly send + commands to the engine and must first send them to the controller, + which will forward them to the corresponding engine. 
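As an aside to the help text above, a hedged sketch of how a child engine driver (such as the RNG cell) would reach the engine through this base driver. The probe function, its command payload and error handling are simplified assumptions; the symbols come from loongson-se.h further down:

#include <linux/errno.h>
#include <linux/mfd/loongson-se.h>
#include <linux/platform_device.h>

static int foo_rng_probe(struct platform_device *pdev)
{
        struct loongson_se_engine *engine;
        u32 *cmd;

        /* The MFD core made the controller our parent device */
        engine = loongson_se_init_engine(pdev->dev.parent, SE_ENGINE_RNG);
        if (!engine)
                return -ENODEV;

        /* Engine commands go into the per-engine command buffer; the
         * controller forwards them and the IRQ handler completes us.
         */
        cmd = engine->command;
        cmd[0] = SE_CMD_RNG;

        return loongson_se_send_engine_cmd(engine);
}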
+ config MFD_QNAP_MCU tristate "QNAP microcontroller unit core driver" depends on SERIAL_DEV_BUS diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f7bdedd5a66d..73df7fc8b5ff 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -295,3 +295,5 @@ obj-$(CONFIG_MFD_RSMU_I2C) += rsmu_i2c.o rsmu_core.o obj-$(CONFIG_MFD_RSMU_SPI) += rsmu_spi.o rsmu_core.o obj-$(CONFIG_MFD_UPBOARD_FPGA) += upboard-fpga.o + +obj-$(CONFIG_MFD_LOONGSON_SE) += loongson-se.o diff --git a/drivers/mfd/loongson-se.c b/drivers/mfd/loongson-se.c new file mode 100644 index 000000000000..3902ba377d69 --- /dev/null +++ b/drivers/mfd/loongson-se.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2025 Loongson Technology Corporation Limited + * + * Author: Yinggang Gu + * Author: Qunqin Zhao + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct loongson_se { + void __iomem *base; + spinlock_t dev_lock; + struct completion cmd_completion; + + void *dmam_base; + int dmam_size; + + struct mutex engine_init_lock; + struct loongson_se_engine engines[SE_ENGINE_MAX]; +}; + +struct loongson_se_controller_cmd { + u32 command_id; + u32 info[7]; +}; + +static int loongson_se_poll(struct loongson_se *se, u32 int_bit) +{ + u32 status; + int err; + + spin_lock_irq(&se->dev_lock); + + /* Notify the controller that the engine needs to be started */ + writel(int_bit, se->base + SE_L2SINT_SET); + + /* Polling until the controller has forwarded the engine command */ + err = readl_relaxed_poll_timeout_atomic(se->base + SE_L2SINT_STAT, status, + !(status & int_bit), + 1, LOONGSON_ENGINE_CMD_TIMEOUT_US); + + spin_unlock_irq(&se->dev_lock); + + return err; +} + +static int loongson_se_send_controller_cmd(struct loongson_se *se, + struct loongson_se_controller_cmd *cmd) +{ + u32 *send_cmd = (u32 *)cmd; + int err, i; + + for (i = 0; i < SE_SEND_CMD_REG_LEN; i++) + writel(send_cmd[i], se->base + SE_SEND_CMD_REG + i * 4); + + err = loongson_se_poll(se, SE_INT_CONTROLLER); + if (err) + return err; + + return wait_for_completion_interruptible(&se->cmd_completion); +} + +int loongson_se_send_engine_cmd(struct loongson_se_engine *engine) +{ + /* + * After engine initialization, the controller already knows + * where to obtain engine commands from. Now all we need to + * do is notify the controller that the engine needs to be started. + */ + int err = loongson_se_poll(engine->se, BIT(engine->id)); + + if (err) + return err; + + return wait_for_completion_interruptible(&engine->completion); +} +EXPORT_SYMBOL_GPL(loongson_se_send_engine_cmd); + +struct loongson_se_engine *loongson_se_init_engine(struct device *dev, int id) +{ + struct loongson_se *se = dev_get_drvdata(dev); + struct loongson_se_engine *engine = &se->engines[id]; + struct loongson_se_controller_cmd cmd; + + engine->se = se; + engine->id = id; + init_completion(&engine->completion); + + /* Divide DMA memory equally among all engines */ + engine->buffer_size = se->dmam_size / SE_ENGINE_MAX; + engine->buffer_off = (se->dmam_size / SE_ENGINE_MAX) * id; + engine->data_buffer = se->dmam_base + engine->buffer_off; + + /* + * There has no engine0, use its data buffer as command buffer for other + * engines. The DMA memory size is obtained from the ACPI table, which + * ensures that the data buffer size of engine0 is larger than the + * command buffer size of all engines. 
+ */ + engine->command = se->dmam_base + id * (2 * SE_ENGINE_CMD_SIZE); + engine->command_ret = engine->command + SE_ENGINE_CMD_SIZE; + + mutex_lock(&se->engine_init_lock); + + /* Tell the controller where to find engine command */ + cmd.command_id = SE_CMD_SET_ENGINE_CMDBUF; + cmd.info[0] = id; + cmd.info[1] = engine->command - se->dmam_base; + cmd.info[2] = 2 * SE_ENGINE_CMD_SIZE; + + if (loongson_se_send_controller_cmd(se, &cmd)) + engine = NULL; + + mutex_unlock(&se->engine_init_lock); + + return engine; +} +EXPORT_SYMBOL_GPL(loongson_se_init_engine); + +static irqreturn_t se_irq_handler(int irq, void *dev_id) +{ + struct loongson_se *se = dev_id; + u32 int_status; + int id; + + spin_lock(&se->dev_lock); + + int_status = readl(se->base + SE_S2LINT_STAT); + + /* For controller */ + if (int_status & SE_INT_CONTROLLER) { + complete(&se->cmd_completion); + int_status &= ~SE_INT_CONTROLLER; + writel(SE_INT_CONTROLLER, se->base + SE_S2LINT_CL); + } + + /* For engines */ + while (int_status) { + id = __ffs(int_status); + complete(&se->engines[id].completion); + int_status &= ~BIT(id); + writel(BIT(id), se->base + SE_S2LINT_CL); + } + + spin_unlock(&se->dev_lock); + + return IRQ_HANDLED; +} + +static int loongson_se_init(struct loongson_se *se, dma_addr_t addr, int size) +{ + struct loongson_se_controller_cmd cmd; + int err; + + cmd.command_id = SE_CMD_START; + err = loongson_se_send_controller_cmd(se, &cmd); + if (err) + return err; + + cmd.command_id = SE_CMD_SET_DMA; + cmd.info[0] = lower_32_bits(addr); + cmd.info[1] = upper_32_bits(addr); + cmd.info[2] = size; + + return loongson_se_send_controller_cmd(se, &cmd); +} + +static const struct mfd_cell engines[] = { + { .name = "loongson-rng" }, + { .name = "tpm_loongson" }, +}; + +static int loongson_se_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct loongson_se *se; + int nr_irq, irq, err, i; + dma_addr_t paddr; + + se = devm_kmalloc(dev, sizeof(*se), GFP_KERNEL); + if (!se) + return -ENOMEM; + + dev_set_drvdata(dev, se); + init_completion(&se->cmd_completion); + spin_lock_init(&se->dev_lock); + mutex_init(&se->engine_init_lock); + + dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); + if (device_property_read_u32(dev, "dmam_size", &se->dmam_size)) + return -ENODEV; + + se->dmam_base = dmam_alloc_coherent(dev, se->dmam_size, &paddr, GFP_KERNEL); + if (!se->dmam_base) + return -ENOMEM; + + se->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(se->base)) + return PTR_ERR(se->base); + + writel(SE_INT_ALL, se->base + SE_S2LINT_EN); + + nr_irq = platform_irq_count(pdev); + if (nr_irq <= 0) + return -ENODEV; + + for (i = 0; i < nr_irq; i++) { + irq = platform_get_irq(pdev, i); + err = devm_request_irq(dev, irq, se_irq_handler, 0, "loongson-se", se); + if (err) + dev_err(dev, "failed to request IRQ: %d\n", irq); + } + + err = loongson_se_init(se, paddr, se->dmam_size); + if (err) + return err; + + return devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE, engines, + ARRAY_SIZE(engines), NULL, 0, NULL); +} + +static const struct acpi_device_id loongson_se_acpi_match[] = { + { "LOON0011", 0 }, + { } +}; +MODULE_DEVICE_TABLE(acpi, loongson_se_acpi_match); + +static struct platform_driver loongson_se_driver = { + .probe = loongson_se_probe, + .driver = { + .name = "loongson-se", + .acpi_match_table = loongson_se_acpi_match, + }, +}; +module_platform_driver(loongson_se_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yinggang Gu "); +MODULE_AUTHOR("Qunqin Zhao "); +MODULE_DESCRIPTION("Loongson Security Engine 
chip controller driver"); diff --git a/include/linux/mfd/loongson-se.h b/include/linux/mfd/loongson-se.h new file mode 100644 index 000000000000..07afa0c2524d --- /dev/null +++ b/include/linux/mfd/loongson-se.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* Copyright (C) 2025 Loongson Technology Corporation Limited */ + +#ifndef __MFD_LOONGSON_SE_H__ +#define __MFD_LOONGSON_SE_H__ + +#define LOONGSON_ENGINE_CMD_TIMEOUT_US 10000 +#define SE_SEND_CMD_REG 0x0 +#define SE_SEND_CMD_REG_LEN 0x8 +/* Controller command ID */ +#define SE_CMD_START 0x0 +#define SE_CMD_SET_DMA 0x3 +#define SE_CMD_SET_ENGINE_CMDBUF 0x4 + +#define SE_S2LINT_STAT 0x88 +#define SE_S2LINT_EN 0x8c +#define SE_S2LINT_CL 0x94 +#define SE_L2SINT_STAT 0x98 +#define SE_L2SINT_SET 0xa0 + +#define SE_INT_ALL 0xffffffff +#define SE_INT_CONTROLLER BIT(0) + +#define SE_ENGINE_MAX 16 +#define SE_ENGINE_RNG 1 +#define SE_CMD_RNG 0x100 + +#define SE_ENGINE_TPM 5 +#define SE_CMD_TPM 0x500 + +#define SE_ENGINE_CMD_SIZE 32 + +struct loongson_se_engine { + struct loongson_se *se; + int id; + + /* Command buffer */ + void *command; + void *command_ret; + + void *data_buffer; + uint buffer_size; + /* Data buffer offset to DMA base */ + uint buffer_off; + + struct completion completion; + +}; + +struct loongson_se_engine *loongson_se_init_engine(struct device *dev, int id); +int loongson_se_send_engine_cmd(struct loongson_se_engine *engine); + +#endif -- cgit v1.2.3 From 65128868bb3b0621d2d8e71f19852675a064b373 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 15:29:04 -0700 Subject: mm/memory_hotplug: Update comment for hotplug memory callback priorities Add clarification to the comment for memory hotplug callback ordering, as the current comment does not make clear which callback happens first. Acked-by: David Hildenbrand Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250829222907.1290912-2-dave.jiang@intel.com Signed-off-by: Dave Jiang --- include/linux/memory.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 40eb70ccb09d..1305102688d0 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -115,8 +115,8 @@ struct notifier_block; struct mem_section; /* - * Priorities for the hotplug memory callback routines (stored in decreasing - * order in the callback chain) + * Priorities for the hotplug memory callback routines. Invoked from + * high to low. Higher priorities correspond to higher numbers. */ #define DEFAULT_CALLBACK_PRI 0 #define SLAB_CALLBACK_PRI 1 -- cgit v1.2.3 From b57fc652ca24ada3b0c888327f9944ed21559286 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 15:29:05 -0700 Subject: drivers/base/node: Add a helper function node_update_perf_attrs() Add a helper function, node_update_perf_attrs(), to allow updates of node access coordinates computed by an external agent such as CXL. The helper allows the coordinates to be updated after the attributes have been created by HMAT.
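A hedged usage sketch of the new helper; the node id and coordinate values below are illustrative only, with the real caller being an external agent such as the CXL region code:

#include <linux/node.h>

/* Illustrative only: push externally computed CPU-access coordinates
 * for an onlined node; the values are made up.
 */
static void example_update_node_perf(int nid)
{
        struct access_coordinate coord = {
                .read_latency = 100,            /* ns */
                .write_latency = 100,
                .read_bandwidth = 20000,        /* MB/s */
                .write_bandwidth = 20000,
        };

        node_update_perf_attrs(nid, &coord, ACCESS_COORDINATE_CPU);
}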
Acked-by: David Hildenbrand Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250829222907.1290912-3-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/base/node.c | 38 ++++++++++++++++++++++++++++++++++++++ include/linux/node.h | 8 ++++++++ 2 files changed, 46 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/node.c b/drivers/base/node.c index 3399594136b2..db18a1c32637 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -248,6 +248,44 @@ void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord, } EXPORT_SYMBOL_GPL(node_set_perf_attrs); +/** + * node_update_perf_attrs - Update the performance values for given access class + * @nid: Node identifier to be updated + * @coord: Heterogeneous memory performance coordinates + * @access: The access class for the given attributes + */ +void node_update_perf_attrs(unsigned int nid, struct access_coordinate *coord, + enum access_coordinate_class access) +{ + struct node_access_nodes *access_node; + struct node *node; + int i; + + if (WARN_ON_ONCE(!node_online(nid))) + return; + + node = node_devices[nid]; + list_for_each_entry(access_node, &node->access_list, list_node) { + if (access_node->access != access) + continue; + + access_node->coord = *coord; + for (i = 0; access_attrs[i]; i++) { + sysfs_notify(&access_node->dev.kobj, + NULL, access_attrs[i]->name); + } + break; + } + + /* When setting CPU access coordinates, update mempolicy */ + if (access != ACCESS_COORDINATE_CPU) + return; + + if (mempolicy_set_node_perf(nid, coord)) + pr_info("failed to set mempolicy attrs for node %d\n", nid); +} +EXPORT_SYMBOL_GPL(node_update_perf_attrs); + /** * struct node_cache_info - Internal tracking for memory node caches * @dev: Device represeting the cache level diff --git a/include/linux/node.h b/include/linux/node.h index 2c7529335b21..866e3323f1fd 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -85,6 +85,8 @@ struct node_cache_attrs { void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs); void node_set_perf_attrs(unsigned int nid, struct access_coordinate *coord, enum access_coordinate_class access); +void node_update_perf_attrs(unsigned int nid, struct access_coordinate *coord, + enum access_coordinate_class access); #else static inline void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs) @@ -96,6 +98,12 @@ static inline void node_set_perf_attrs(unsigned int nid, enum access_coordinate_class access) { } + +static inline void node_update_perf_attrs(unsigned int nid, + struct access_coordinate *coord, + enum access_coordinate_class access) +{ +} #endif struct node { -- cgit v1.2.3 From 2e454fb8056df6da4bba7d89a57bf60e217463c0 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 15:29:06 -0700 Subject: cxl, acpi/hmat: Update CXL access coordinates directly instead of through HMAT The current implementation of the CXL memory hotplug notifier gets called before the HMAT memory hotplug notifier. The CXL driver calculates the access coordinates (bandwidth and latency values) for the CXL end-to-end path (i.e. CPU to endpoint). When the CXL region is onlined, the CXL memory hotplug notifier writes the access coordinates to the HMAT target structs. Then the HMAT memory hotplug notifier is called and it creates the access coordinates for the node sysfs attributes.
During testing on an Intel platform, it was found that although the newly calculated coordinates were pushed to sysfs, the sysfs attributes for the access coordinates showed up with the wrong initiator. The system has 4 nodes (0, 1, 2, 3) where nodes 0 and 1 are CPU nodes and nodes 2 and 3 are CXL nodes. The expectation is that node 2 would show up as a target to node 0: /sys/devices/system/node/node2/access0/initiators/node0 However, it was observed that node 2 showed up as a target under node 1: /sys/devices/system/node/node2/access0/initiators/node1 The original intent of the 'ext_updated' flag in the HMAT handling code was to stop the HMAT memory hotplug callback from clobbering the access coordinates after CXL has injected its calculated coordinates and replaced the generic target access coordinates provided by the HMAT table in the HMAT target structs. However, the flag is hacky at best and blocks updates from other CXL regions that are onlined in the same node later on. Remove the 'ext_updated' flag usage and just update the access coordinates for the nodes directly without touching HMAT target data. The hotplug memory callback ordering is changed: instead of changing CXL, move HMAT back so there is room between the levels rather than having CXL share the same level as SLAB_CALLBACK_PRI. With this change, the CXL hotplug memory notifier runs after the HMAT callback. The HMAT callback will create the node sysfs attributes for access coordinates. The CXL callback will write the access coordinates to the now-created node sysfs attributes directly and will not pollute the HMAT target values. A nodemask is introduced to track whether a node has been updated and to prevent further updates. Fixes: 067353a46d8c ("cxl/region: Add memory hotplug notifier for cxl region") Cc: stable@vger.kernel.org Tested-by: Marc Herbert Reviewed-by: Dan Williams Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250829222907.1290912-4-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/acpi/numa/hmat.c | 6 ------ drivers/cxl/core/cdat.c | 5 ----- drivers/cxl/core/core.h | 1 - drivers/cxl/core/region.c | 20 ++++++++++++-------- include/linux/memory.h | 2 +- 5 files changed, 13 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 4958301f5417..5d32490dc4ab 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -74,7 +74,6 @@ struct memory_target { struct node_cache_attrs cache_attrs; u8 gen_port_device_handle[ACPI_SRAT_DEVICE_HANDLE_SIZE]; bool registered; - bool ext_updated; /* externally updated */ }; struct memory_initiator { @@ -391,7 +390,6 @@ int hmat_update_target_coordinates(int nid, struct access_coordinate *coord, coord->read_bandwidth, access); hmat_update_target_access(target, ACPI_HMAT_WRITE_BANDWIDTH, coord->write_bandwidth, access); - target->ext_updated = true; return 0; } @@ -773,10 +771,6 @@ static void hmat_update_target_attrs(struct memory_target *target, u32 best = 0; int i; - /* Don't update if an external agent has changed the data. 
*/ - if (target->ext_updated) - return; - /* Don't update for generic port if there's no device handle */ if ((access == NODE_ACCESS_CLASS_GENPORT_SINK_LOCAL || access == NODE_ACCESS_CLASS_GENPORT_SINK_CPU) && diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index c0af645425f4..c891fd618cfd 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -1081,8 +1081,3 @@ int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, { return hmat_update_target_coordinates(nid, &cxlr->coord[access], access); } - -bool cxl_need_node_perf_attrs_update(int nid) -{ - return !acpi_node_backed_by_real_pxm(nid); -} diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 2669f251d677..a253d308f3c9 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -139,7 +139,6 @@ long cxl_pci_get_latency(struct pci_dev *pdev); int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c); int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, enum access_coordinate_class access); -bool cxl_need_node_perf_attrs_update(int nid); int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, struct access_coordinate *c); diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 71cc42d05248..0ed95cbc5d5b 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -30,6 +30,12 @@ * 3. Decoder targets */ +/* + * nodemask that sets per node when the access_coordinates for the node has + * been updated by the CXL memory hotplug notifier. + */ +static nodemask_t nodemask_region_seen = NODE_MASK_NONE; + static struct cxl_region *to_cxl_region(struct device *dev); #define __ACCESS_ATTR_RO(_level, _name) { \ @@ -2442,14 +2448,8 @@ static bool cxl_region_update_coordinates(struct cxl_region *cxlr, int nid) for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { if (cxlr->coord[i].read_bandwidth) { - rc = 0; - if (cxl_need_node_perf_attrs_update(nid)) - node_set_perf_attrs(nid, &cxlr->coord[i], i); - else - rc = cxl_update_hmat_access_coordinates(nid, cxlr, i); - - if (rc == 0) - cset++; + node_update_perf_attrs(nid, &cxlr->coord[i], i); + cset++; } } @@ -2487,6 +2487,10 @@ static int cxl_region_perf_attrs_callback(struct notifier_block *nb, if (nid != region_nid) return NOTIFY_DONE; + /* No action needed if node bit already set */ + if (node_test_and_set(nid, nodemask_region_seen)) + return NOTIFY_DONE; + if (!cxl_region_update_coordinates(cxlr, nid)) return NOTIFY_DONE; diff --git a/include/linux/memory.h b/include/linux/memory.h index 1305102688d0..0b755d1ef1ec 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -120,8 +120,8 @@ struct mem_section; */ #define DEFAULT_CALLBACK_PRI 0 #define SLAB_CALLBACK_PRI 1 -#define HMAT_CALLBACK_PRI 2 #define CXL_CALLBACK_PRI 5 +#define HMAT_CALLBACK_PRI 6 #define MM_COMPUTE_BATCH_PRI 10 #define CPUSET_CALLBACK_PRI 10 #define MEMTIER_HOTPLUG_PRI 100 -- cgit v1.2.3 From e99ecbc4c89adf551cccbbc00b5cb08c50969af6 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 29 Aug 2025 15:29:07 -0700 Subject: acpi/hmat: Remove now unused hmat_update_target_coordinates() Remove dead code since CXL no longer calls hmat_update_target_coordinates(). 
Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250829222907.1290912-5-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/acpi/numa/hmat.c | 28 ---------------------------- drivers/cxl/core/cdat.c | 6 ------ drivers/cxl/core/core.h | 2 -- include/linux/acpi.h | 12 ------------ 4 files changed, 48 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 5d32490dc4ab..5a36d57289b4 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -367,34 +367,6 @@ static void hmat_update_target_access(struct memory_target *target, } } -int hmat_update_target_coordinates(int nid, struct access_coordinate *coord, - enum access_coordinate_class access) -{ - struct memory_target *target; - int pxm; - - if (nid == NUMA_NO_NODE) - return -EINVAL; - - pxm = node_to_pxm(nid); - guard(mutex)(&target_lock); - target = find_mem_target(pxm); - if (!target) - return -ENODEV; - - hmat_update_target_access(target, ACPI_HMAT_READ_LATENCY, - coord->read_latency, access); - hmat_update_target_access(target, ACPI_HMAT_WRITE_LATENCY, - coord->write_latency, access); - hmat_update_target_access(target, ACPI_HMAT_READ_BANDWIDTH, - coord->read_bandwidth, access); - hmat_update_target_access(target, ACPI_HMAT_WRITE_BANDWIDTH, - coord->write_bandwidth, access); - - return 0; -} -EXPORT_SYMBOL_GPL(hmat_update_target_coordinates); - static __init void hmat_add_locality(struct acpi_hmat_locality *hmat_loc) { struct memory_locality *loc; diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index c891fd618cfd..bca1ec279651 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -1075,9 +1075,3 @@ void cxl_region_perf_data_calculate(struct cxl_region *cxlr, cxlr->coord[i].write_bandwidth += perf->coord[i].write_bandwidth; } } - -int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, - enum access_coordinate_class access) -{ - return hmat_update_target_coordinates(nid, &cxlr->coord[access], access); -} diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index a253d308f3c9..0476c3b648de 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -137,8 +137,6 @@ enum cxl_poison_trace_type { long cxl_pci_get_latency(struct pci_dev *pdev); int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c); -int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, - enum access_coordinate_class access); int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, struct access_coordinate *c); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 1c5bb1e887cd..5ff5d99f6ead 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1595,18 +1595,6 @@ static inline void acpi_use_parent_companion(struct device *dev) ACPI_COMPANION_SET(dev, ACPI_COMPANION(dev->parent)); } -#ifdef CONFIG_ACPI_HMAT -int hmat_update_target_coordinates(int nid, struct access_coordinate *coord, - enum access_coordinate_class access); -#else -static inline int hmat_update_target_coordinates(int nid, - struct access_coordinate *coord, - enum access_coordinate_class access) -{ - return -EOPNOTSUPP; -} -#endif - #ifdef CONFIG_ACPI_NUMA bool acpi_node_backed_by_real_pxm(int nid); #else -- cgit v1.2.3 From 21368fcbb124d51b5d8bd8fa0a286a23c34a0888 Mon Sep 17 00:00:00 2001 From: Nicolas Frattaroli Date: Mon, 25 Aug 2025 10:28:21 +0200 Subject: bitmap: introduce hardware-specific bitfield operations Hardware of various vendors, but very notably Rockchip, often uses 32-bit 
registers where the upper 16-bit half of the register is a write-enable mask for the lower half. This type of hardware setup allows for more granular concurrent register write access. Over the years, many drivers have hand-rolled their own version of this macro, usually without any checks, often called something like HIWORD_UPDATE or FIELD_PREP_HIWORD, commonly with slightly different semantics between them. Clearly there is a demand for such a macro, and thus the demand should be satisfied in a common header file. As this is a convention that spans across multiple vendors, and similar conventions may also have cross-vendor adoption, it's best if it lives in a vendor-agnostic header file that can be expanded over time. Add hw_bitfield.h with two macros: FIELD_PREP_WM16, and FIELD_PREP_WM16_CONST. The latter is a version that can be used in initializers, like FIELD_PREP_CONST. Suggested-by: Yury Norov (NVIDIA) Signed-off-by: Nicolas Frattaroli Acked-by: Jakub Kicinski Acked-by: Heiko Stuebner Signed-off-by: Yury Norov (NVIDIA) --- MAINTAINERS | 1 + include/linux/hw_bitfield.h | 62 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 include/linux/hw_bitfield.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 6dcfbd11efef..4a2db6c4b8ab 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4275,6 +4275,7 @@ F: include/linux/bits.h F: include/linux/cpumask.h F: include/linux/cpumask_types.h F: include/linux/find.h +F: include/linux/hw_bitfield.h F: include/linux/nodemask.h F: include/linux/nodemask_types.h F: include/uapi/linux/bits.h diff --git a/include/linux/hw_bitfield.h b/include/linux/hw_bitfield.h new file mode 100644 index 000000000000..df202e167ce4 --- /dev/null +++ b/include/linux/hw_bitfield.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2025, Collabora Ltd. + */ + +#ifndef _LINUX_HW_BITFIELD_H +#define _LINUX_HW_BITFIELD_H + +#include +#include +#include + +/** + * FIELD_PREP_WM16() - prepare a bitfield element with a mask in the upper half + * @_mask: shifted mask defining the field's length and position + * @_val: value to put in the field + * + * FIELD_PREP_WM16() masks and shifts up the value, as well as bitwise ORs the + * result with the mask shifted up by 16. + * + * This is useful for a common design of hardware registers where the upper + * 16-bit half of a 32-bit register is used as a write-enable mask. In such a + * register, a bit in the lower half is only updated if the corresponding bit + * in the upper half is high. + */ +#define FIELD_PREP_WM16(_mask, _val) \ + ({ \ + typeof(_val) __val = _val; \ + typeof(_mask) __mask = _mask; \ + __BF_FIELD_CHECK(__mask, ((u16)0U), __val, \ + "HWORD_UPDATE: "); \ + (((typeof(__mask))(__val) << __bf_shf(__mask)) & (__mask)) | \ + ((__mask) << 16); \ + }) + +/** + * FIELD_PREP_WM16_CONST() - prepare a constant bitfield element with a mask in + * the upper half + * @_mask: shifted mask defining the field's length and position + * @_val: value to put in the field + * + * FIELD_PREP_WM16_CONST() masks and shifts up the value, as well as bitwise ORs + * the result with the mask shifted up by 16. + * + * This is useful for a common design of hardware registers where the upper + * 16-bit half of a 32-bit register is used as a write-enable mask. In such a + * register, a bit in the lower half is only updated if the corresponding bit + * in the upper half is high. 
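+ * + * For example, FIELD_PREP_WM16_CONST(GENMASK(3, 0), 5) evaluates to + * 0x000f0005: the write-enable bits 19:16 are set and the value 5 is + * placed in bits 3:0.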
+ * + * Unlike FIELD_PREP_WM16(), this is a constant expression and can therefore + * be used in initializers. Error checking is less comfortable for this + * version. + */ +#define FIELD_PREP_WM16_CONST(_mask, _val) \ + ( \ + FIELD_PREP_CONST(_mask, _val) | \ + (BUILD_BUG_ON_ZERO(const_true((u64)(_mask) > U16_MAX)) + \ + ((_mask) << 16)) \ + ) + + +#endif /* _LINUX_HW_BITFIELD_H */ -- cgit v1.2.3 From df3a7762ee24ba6a33d4215244e329ca300f4819 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 2 Sep 2025 10:06:56 -0600 Subject: io_uring/uring_cmd: add io_uring_cmd_tw_t type alias Introduce a function pointer type alias io_uring_cmd_tw_t for the uring_cmd task work callback. This avoids repeating the signature in several places. Also name both arguments to the callback to clarify what they represent. Signed-off-by: Caleb Sander Mateos Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20250902160657.1726828-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 13 ++++++++----- io_uring/uring_cmd.c | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 4bd3a7339243..7211157edfe9 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -11,11 +11,14 @@ /* io_uring_cmd is being issued again */ #define IORING_URING_CMD_REISSUE (1U << 31) +typedef void (*io_uring_cmd_tw_t)(struct io_uring_cmd *cmd, + unsigned issue_flags); + struct io_uring_cmd { struct file *file; const struct io_uring_sqe *sqe; /* callback to defer completions to task context */ - void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); + io_uring_cmd_tw_t task_work_cb; u32 cmd_op; u32 flags; u8 pdu[32]; /* available inline for free use */ @@ -57,7 +60,7 @@ void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, u64 res2, unsigned issue_flags); void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned), + io_uring_cmd_tw_t task_work_cb, unsigned flags); /* @@ -106,7 +109,7 @@ static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, { } static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned), + io_uring_cmd_tw_t task_work_cb, unsigned flags) { } @@ -143,13 +146,13 @@ static inline void io_uring_cmd_iopoll_done(struct io_uring_cmd *ioucmd, /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) + io_uring_cmd_tw_t task_work_cb) { __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); } static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) + io_uring_cmd_tw_t task_work_cb) { __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index f5a2642bb407..d76d6d27765c 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -126,7 +126,7 @@ static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) } void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned), + io_uring_cmd_tw_t task_work_cb, unsigned flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); -- cgit v1.2.3 From 04a3134f88a4bd03001a3093144819523cfca99e Mon Sep 17 00:00:00 2001 From: Saeed 
Mahameed Date: Tue, 2 Sep 2025 22:45:24 -0700 Subject: net/mlx5: Add PSP capabilities structures and bits Add mlx5_ifc PSP related capabilities structures and HW definitions needed for PSP support in mlx5. Link: https://lore.kernel.org/netdev/20250828162953.2707727-1-daniel.zahka@gmail.com/ Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 6 ++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 1 + .../mellanox/mlx5/core/steering/hws/definer.c | 2 +- include/linux/mlx5/device.h | 4 + include/linux/mlx5/mlx5_ifc.h | 95 +++++++++++++++++++++- 5 files changed, 103 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 57476487e31f..eeb4437975f2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -294,6 +294,12 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) return err; } + if (MLX5_CAP_GEN(dev, psp)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_PSP); + if (err) + return err; + } + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 8517d4e5d5ef..0951c7cc1b5f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1798,6 +1798,7 @@ static const int types[] = { MLX5_CAP_VDPA_EMULATION, MLX5_CAP_IPSEC, MLX5_CAP_PORT_SELECTION, + MLX5_CAP_PSP, MLX5_CAP_MACSEC, MLX5_CAP_ADV_VIRTUALIZATION, MLX5_CAP_CRYPTO, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c index c6436c3a7a83..c4bb6967f74d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c @@ -1280,7 +1280,7 @@ hws_definer_conv_misc2(struct mlx5hws_definer_conv_data *cd, struct mlx5hws_definer_fc *fc = cd->fc; struct mlx5hws_definer_fc *curr_fc; - if (HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.reserved_at_1a0, 0x8) || + if (HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.psp_syndrome, 0x8) || HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.ipsec_next_header, 0x8) || HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.reserved_at_1c0, 0x40) || diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 9d2467f982ad..72a83666e67f 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1248,6 +1248,7 @@ enum mlx5_cap_type { MLX5_CAP_IPSEC, MLX5_CAP_CRYPTO = 0x1a, MLX5_CAP_SHAMPO = 0x1d, + MLX5_CAP_PSP = 0x1e, MLX5_CAP_MACSEC = 0x1f, MLX5_CAP_GENERAL_2 = 0x20, MLX5_CAP_PORT_SELECTION = 0x25, @@ -1487,6 +1488,9 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_SHAMPO(mdev, cap) \ MLX5_GET(shampo_cap, mdev->caps.hca[MLX5_CAP_SHAMPO]->cur, cap) +#define MLX5_CAP_PSP(mdev, cap)\ + MLX5_GET(psp_cap, (mdev)->caps.hca[MLX5_CAP_PSP]->cur, cap) + enum { MLX5_CMD_STAT_OK = 0x0, MLX5_CMD_STAT_INT_ERR = 0x1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 44d497272162..e9f14a0c7f4f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -314,6 +314,8 @@ enum { MLX5_CMD_OP_CREATE_UMEM = 0xa08, MLX5_CMD_OP_DESTROY_UMEM = 0xa0a, MLX5_CMD_OP_SYNC_STEERING = 0xb00, + MLX5_CMD_OP_PSP_GEN_SPI = 0xb10, + MLX5_CMD_OP_PSP_ROTATE_KEY = 0xb11, MLX5_CMD_OP_QUERY_VHCA_STATE = 0xb0d, MLX5_CMD_OP_MODIFY_VHCA_STATE = 0xb0e, MLX5_CMD_OP_SYNC_CRYPTO = 0xb12, @@ 
-489,12 +491,14 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 execute_aso[0x1]; u8 reserved_at_47[0x19]; - u8 reserved_at_60[0x2]; + u8 reformat_l2_to_l3_psp_tunnel[0x1]; + u8 reformat_l3_psp_tunnel_to_l2[0x1]; u8 reformat_insert[0x1]; u8 reformat_remove[0x1]; u8 macsec_encrypt[0x1]; u8 macsec_decrypt[0x1]; - u8 reserved_at_66[0x2]; + u8 psp_encrypt[0x1]; + u8 psp_decrypt[0x1]; u8 reformat_add_macsec[0x1]; u8 reformat_remove_macsec[0x1]; u8 reparse[0x1]; @@ -703,7 +707,7 @@ struct mlx5_ifc_fte_match_set_misc2_bits { u8 metadata_reg_a[0x20]; - u8 reserved_at_1a0[0x8]; + u8 psp_syndrome[0x8]; u8 macsec_syndrome[0x8]; u8 ipsec_syndrome[0x8]; u8 ipsec_next_header[0x8]; @@ -1511,6 +1515,21 @@ struct mlx5_ifc_macsec_cap_bits { u8 reserved_at_40[0x7c0]; }; +struct mlx5_ifc_psp_cap_bits { + u8 reserved_at_0[0x1]; + u8 psp_crypto_offload[0x1]; + u8 reserved_at_2[0x1]; + u8 psp_crypto_esp_aes_gcm_256_encrypt[0x1]; + u8 psp_crypto_esp_aes_gcm_128_encrypt[0x1]; + u8 psp_crypto_esp_aes_gcm_256_decrypt[0x1]; + u8 psp_crypto_esp_aes_gcm_128_decrypt[0x1]; + u8 reserved_at_7[0x4]; + u8 log_max_num_of_psp_spi[0x5]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x7e0]; +}; + enum { MLX5_WQ_TYPE_LINKED_LIST = 0x0, MLX5_WQ_TYPE_CYCLIC = 0x1, @@ -1876,7 +1895,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_2a0[0x7]; u8 mkey_pcie_tph[0x1]; - u8 reserved_at_2a8[0x3]; + u8 reserved_at_2a8[0x2]; + + u8 psp[0x1]; u8 shampo[0x1]; u8 reserved_at_2ac[0x4]; u8 max_wqe_sz_rq[0x10]; @@ -3803,6 +3824,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_macsec_cap_bits macsec_cap; struct mlx5_ifc_crypto_cap_bits crypto_cap; struct mlx5_ifc_ipsec_cap_bits ipsec_cap; + struct mlx5_ifc_psp_cap_bits psp_cap; u8 reserved_at_0[0x8000]; }; @@ -3832,6 +3854,7 @@ enum { enum { MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_IPSEC = 0x0, MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_MACSEC = 0x1, + MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_PSP = 0x2, }; struct mlx5_ifc_vlan_bits { @@ -7159,6 +7182,8 @@ enum mlx5_reformat_ctx_type { MLX5_REFORMAT_TYPE_DEL_ESP_TRANSPORT_OVER_UDP = 0xa, MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV6 = 0xb, MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_UDPV6 = 0xc, + MLX5_REFORMAT_TYPE_ADD_PSP_TUNNEL = 0xd, + MLX5_REFORMAT_TYPE_DEL_PSP_TUNNEL = 0xe, MLX5_REFORMAT_TYPE_INSERT_HDR = 0xf, MLX5_REFORMAT_TYPE_REMOVE_HDR = 0x10, MLX5_REFORMAT_TYPE_ADD_MACSEC = 0x11, @@ -7285,6 +7310,7 @@ enum { MLX5_ACTION_IN_FIELD_IPSEC_SYNDROME = 0x5D, MLX5_ACTION_IN_FIELD_OUT_EMD_47_32 = 0x6F, MLX5_ACTION_IN_FIELD_OUT_EMD_31_0 = 0x70, + MLX5_ACTION_IN_FIELD_PSP_SYNDROME = 0x71, }; struct mlx5_ifc_alloc_modify_header_context_out_bits { @@ -13079,6 +13105,7 @@ enum { MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_PURPOSE_TLS = 0x1, MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_PURPOSE_IPSEC = 0x2, MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_PURPOSE_MACSEC = 0x4, + MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_PURPOSE_PSP = 0x6, }; struct mlx5_ifc_tls_static_params_bits { @@ -13496,4 +13523,64 @@ enum mlx5e_pcie_cong_event_mod_field { MLX5_PCIE_CONG_EVENT_MOD_THRESH = BIT(2), }; +struct mlx5_ifc_psp_rotate_key_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_psp_rotate_key_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +enum mlx5_psp_gen_spi_in_key_size { + MLX5_PSP_GEN_SPI_IN_KEY_SIZE_128 = 0x0, + MLX5_PSP_GEN_SPI_IN_KEY_SIZE_256 = 0x1, +}; + +struct mlx5_ifc_key_spi_bits { + u8 spi[0x20]; + 
+ u8 reserved_at_20[0x60]; + + u8 key[8][0x20]; +}; + +struct mlx5_ifc_psp_gen_spi_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + + u8 key_size[0x2]; + u8 reserved_at_62[0xe]; + u8 num_of_spi[0x10]; +}; + +struct mlx5_ifc_psp_gen_spi_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x10]; + u8 num_of_spi[0x10]; + + u8 reserved_at_60[0x20]; + + struct mlx5_ifc_key_spi_bits key_spi[]; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3 From ddeb66d2cb10f03a43d97a0ff2c3869d1951c87d Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 26 Aug 2025 11:54:36 +0200 Subject: gpio: nomadik: don't print out global GPIO numbers in debugfs callbacks In order to further limit the number of references to the GPIO base number stored in struct gpio_chip, replace the global GPIO numbers in the output of debugfs callbacks by hardware offsets. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250826-gpio-dbg-show-base-v1-2-7f27cd7f2256@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-nomadik.c | 25 ++++++++++++------------- drivers/pinctrl/nomadik/pinctrl-nomadik.c | 2 +- include/linux/gpio/gpio-nomadik.h | 3 +-- 3 files changed, 14 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-nomadik.c b/drivers/gpio/gpio-nomadik.c index bcf4b07dd458..fde4b416faa8 100644 --- a/drivers/gpio/gpio-nomadik.c +++ b/drivers/gpio/gpio-nomadik.c @@ -20,6 +20,7 @@ */ #include #include +#include #include #include #include @@ -396,10 +397,10 @@ static int nmk_gpio_get_mode(struct nmk_gpio_chip *nmk_chip, int offset) } void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, - struct gpio_chip *chip, unsigned int offset, - unsigned int gpio) + struct gpio_chip *chip, unsigned int offset) { struct nmk_gpio_chip *nmk_chip = gpiochip_get_data(chip); + struct gpio_desc *desc; int mode; bool is_out; bool data_out; @@ -425,15 +426,15 @@ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, data_out = !!(readl(nmk_chip->addr + NMK_GPIO_DAT) & BIT(offset)); mode = nmk_gpio_get_mode(nmk_chip, offset); #ifdef CONFIG_PINCTRL_NOMADIK - if (mode == NMK_GPIO_ALT_C && pctldev) - mode = nmk_prcm_gpiocr_get_mode(pctldev, gpio); + if (mode == NMK_GPIO_ALT_C && pctldev) { + desc = gpio_device_get_desc(chip->gpiodev, offset); + mode = nmk_prcm_gpiocr_get_mode(pctldev, desc_to_gpio(desc)); + } #endif if (is_out) { seq_printf(s, " gpio-%-3d (%-20.20s) out %s %s", - gpio, - label ?: "(none)", - str_hi_lo(data_out), + offset, label ?: "(none)", str_hi_lo(data_out), (mode < 0) ? "unknown" : modes[mode]); } else { int irq = chip->to_irq(chip, offset); @@ -445,9 +446,7 @@ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, }; seq_printf(s, " gpio-%-3d (%-20.20s) in %s %s", - gpio, - label ?: "(none)", - pulls[pullidx], + offset, label ?: "(none)", pulls[pullidx], (mode < 0) ? 
"unknown" : modes[mode]); val = nmk_gpio_get_input(chip, offset); @@ -479,10 +478,10 @@ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, static void nmk_gpio_dbg_show(struct seq_file *s, struct gpio_chip *chip) { - unsigned int i, gpio = chip->base; + unsigned int i; - for (i = 0; i < chip->ngpio; i++, gpio++) { - nmk_gpio_dbg_show_one(s, NULL, chip, i, gpio); + for (i = 0; i < chip->ngpio; i++) { + nmk_gpio_dbg_show_one(s, NULL, chip, i); seq_puts(s, "\n"); } } diff --git a/drivers/pinctrl/nomadik/pinctrl-nomadik.c b/drivers/pinctrl/nomadik/pinctrl-nomadik.c index 8940e04fcf4c..db0311b14132 100644 --- a/drivers/pinctrl/nomadik/pinctrl-nomadik.c +++ b/drivers/pinctrl/nomadik/pinctrl-nomadik.c @@ -584,7 +584,7 @@ static void nmk_pin_dbg_show(struct pinctrl_dev *pctldev, struct seq_file *s, seq_printf(s, "invalid pin offset"); return; } - nmk_gpio_dbg_show_one(s, pctldev, chip, offset - chip->base, offset); + nmk_gpio_dbg_show_one(s, pctldev, chip, offset - chip->base); } static int nmk_dt_add_map_mux(struct pinctrl_map **map, unsigned int *reserved_maps, diff --git a/include/linux/gpio/gpio-nomadik.h b/include/linux/gpio/gpio-nomadik.h index b5a84864650d..7ba53b499e16 100644 --- a/include/linux/gpio/gpio-nomadik.h +++ b/include/linux/gpio/gpio-nomadik.h @@ -261,8 +261,7 @@ struct platform_device; * true. */ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, - struct gpio_chip *chip, unsigned int offset, - unsigned int gpio); + struct gpio_chip *chip, unsigned int offset); #else -- cgit v1.2.3 From 661f951e371cc134ea31c84238dbdc9a898b8403 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Aug 2025 12:02:44 +0000 Subject: sched/fair: Get rid of sched_domains_curr_level hack for tl->cpumask() Leon [1] and Vinicius [2] noted a topology_span_sane() warning during their testing starting from v6.16-rc1. Debug that followed pointed to the tl->mask() for the NODE domain being incorrectly resolved to that of the highest NUMA domain. tl->mask() for NODE is set to the sd_numa_mask() which depends on the global "sched_domains_curr_level" hack. "sched_domains_curr_level" is set to the "tl->numa_level" during tl traversal in build_sched_domains() calling sd_init() but was not reset before topology_span_sane(). Since "tl->numa_level" still reflected the old value from build_sched_domains(), topology_span_sane() for the NODE domain trips when the span of the last NUMA domain overlaps. Instead of replicating the "sched_domains_curr_level" hack, get rid of it entirely and instead, pass the entire "sched_domain_topology_level" object to tl->cpumask() function to prevent such mishap in the future. sd_numa_mask() now directly references "tl->numa_level" instead of relying on the global "sched_domains_curr_level" hack to index into sched_domains_numa_masks[]. 
The original warning was reproducible on the following NUMA topology reported by Leon: $ sudo numactl -H available: 5 nodes (0-4) node 0 cpus: 0 1 node 0 size: 2927 MB node 0 free: 1603 MB node 1 cpus: 2 3 node 1 size: 3023 MB node 1 free: 3008 MB node 2 cpus: 4 5 node 2 size: 3023 MB node 2 free: 3007 MB node 3 cpus: 6 7 node 3 size: 3023 MB node 3 free: 3002 MB node 4 cpus: 8 9 node 4 size: 3022 MB node 4 free: 2718 MB node distances: node 0 1 2 3 4 0: 10 39 38 37 36 1: 39 10 38 37 36 2: 38 38 10 37 36 3: 37 37 37 10 36 4: 36 36 36 36 10 The above topology can be mimicked using the following QEMU cmd that was used to reproduce the warning and test the fix: sudo qemu-system-x86_64 -enable-kvm -cpu host \ -m 20G -smp cpus=10,sockets=10 -machine q35 \ -object memory-backend-ram,size=4G,id=m0 \ -object memory-backend-ram,size=4G,id=m1 \ -object memory-backend-ram,size=4G,id=m2 \ -object memory-backend-ram,size=4G,id=m3 \ -object memory-backend-ram,size=4G,id=m4 \ -numa node,cpus=0-1,memdev=m0,nodeid=0 \ -numa node,cpus=2-3,memdev=m1,nodeid=1 \ -numa node,cpus=4-5,memdev=m2,nodeid=2 \ -numa node,cpus=6-7,memdev=m3,nodeid=3 \ -numa node,cpus=8-9,memdev=m4,nodeid=4 \ -numa dist,src=0,dst=1,val=39 \ -numa dist,src=0,dst=2,val=38 \ -numa dist,src=0,dst=3,val=37 \ -numa dist,src=0,dst=4,val=36 \ -numa dist,src=1,dst=0,val=39 \ -numa dist,src=1,dst=2,val=38 \ -numa dist,src=1,dst=3,val=37 \ -numa dist,src=1,dst=4,val=36 \ -numa dist,src=2,dst=0,val=38 \ -numa dist,src=2,dst=1,val=38 \ -numa dist,src=2,dst=3,val=37 \ -numa dist,src=2,dst=4,val=36 \ -numa dist,src=3,dst=0,val=37 \ -numa dist,src=3,dst=1,val=37 \ -numa dist,src=3,dst=2,val=37 \ -numa dist,src=3,dst=4,val=36 \ -numa dist,src=4,dst=0,val=36 \ -numa dist,src=4,dst=1,val=36 \ -numa dist,src=4,dst=2,val=36 \ -numa dist,src=4,dst=3,val=36 \ ... [ prateek: Moved common functions to include/linux/sched/topology.h, reuse the common bits for s390 and ppc, commit message ] Closes: https://lore.kernel.org/lkml/20250610110701.GA256154@unreal/ [1] Fixes: ccf74128d66c ("sched/topology: Assert non-NUMA topology masks don't (partially) overlap") # ce29a7da84cd, f55dac1dafb3 Signed-off-by: Peter Zijlstra (Intel) Reported-by: Leon Romanovsky Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Shrikanth Hegde Tested-by: Valentin Schneider # x86 Tested-by: Shrikanth Hegde # powerpc Link: https://lore.kernel.org/lkml/a3de98387abad28592e6ab591f3ff6107fe01dc1.1755893468.git.tim.c.chen@linux.intel.com/ [2] --- arch/powerpc/Kconfig | 4 ++++ arch/powerpc/include/asm/topology.h | 2 ++ arch/powerpc/kernel/smp.c | 27 +++++++++++---------------- arch/s390/kernel/topology.c | 20 +++++++------------- arch/x86/kernel/smpboot.c | 8 ++++---- include/linux/sched/topology.h | 28 +++++++++++++++++++++++++++- include/linux/topology.h | 2 +- kernel/sched/topology.c | 28 ++++++++++------------------ 8 files changed, 66 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 93402a1d9c9f..e51a595a0622 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -971,6 +971,10 @@ config SCHED_SMT when dealing with POWER5 cpus at a cost of slightly increased overhead in some places. If unsure say N here. 
+config SCHED_MC + def_bool y + depends on SMP + config PPC_DENORMALISATION bool "PowerPC denormalisation exception handling" depends on PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index da15b5efe807..f19ca44512d1 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -131,6 +131,8 @@ static inline int cpu_to_coregroup_id(int cpu) #ifdef CONFIG_SMP #include +struct cpumask *cpu_coregroup_mask(int cpu); + #ifdef CONFIG_PPC64 #include diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index f59e4b9cc207..68edb66c2964 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1028,19 +1028,19 @@ static int powerpc_shared_proc_flags(void) * We can't just pass cpu_l2_cache_mask() directly because * returns a non-const pointer and the compiler barfs on that. */ -static const struct cpumask *shared_cache_mask(int cpu) +static const struct cpumask *tl_cache_mask(struct sched_domain_topology_level *tl, int cpu) { return per_cpu(cpu_l2_cache_map, cpu); } #ifdef CONFIG_SCHED_SMT -static const struct cpumask *smallcore_smt_mask(int cpu) +static const struct cpumask *tl_smallcore_smt_mask(struct sched_domain_topology_level *tl, int cpu) { return cpu_smallcore_mask(cpu); } #endif -static struct cpumask *cpu_coregroup_mask(int cpu) +struct cpumask *cpu_coregroup_mask(int cpu) { return per_cpu(cpu_coregroup_map, cpu); } @@ -1054,11 +1054,6 @@ static bool has_coregroup_support(void) return coregroup_enabled; } -static const struct cpumask *cpu_mc_mask(int cpu) -{ - return cpu_coregroup_mask(cpu); -} - static int __init init_big_cores(void) { int cpu; @@ -1448,7 +1443,7 @@ static bool update_mask_by_l2(int cpu, cpumask_var_t *mask) return false; } - cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); + cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu)); /* Update l2-cache mask with all the CPUs that are part of submask */ or_cpumasks_related(cpu, cpu, submask_fn, cpu_l2_cache_mask); @@ -1538,7 +1533,7 @@ static void update_coregroup_mask(int cpu, cpumask_var_t *mask) return; } - cpumask_and(*mask, cpu_online_mask, cpu_cpu_mask(cpu)); + cpumask_and(*mask, cpu_online_mask, cpu_node_mask(cpu)); /* Update coregroup mask with all the CPUs that are part of submask */ or_cpumasks_related(cpu, cpu, submask_fn, cpu_coregroup_mask); @@ -1601,7 +1596,7 @@ static void add_cpu_to_masks(int cpu) /* If chip_id is -1; limit the cpu_core_mask to within PKG */ if (chip_id == -1) - cpumask_and(mask, mask, cpu_cpu_mask(cpu)); + cpumask_and(mask, mask, cpu_node_mask(cpu)); for_each_cpu(i, mask) { if (chip_id == cpu_to_chip_id(i)) { @@ -1701,22 +1696,22 @@ static void __init build_sched_topology(void) if (has_big_cores) { pr_info("Big cores detected but using small core scheduling\n"); powerpc_topology[i++] = - SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT); + SDTL_INIT(tl_smallcore_smt_mask, powerpc_smt_flags, SMT); } else { - powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT); + powerpc_topology[i++] = SDTL_INIT(tl_smt_mask, powerpc_smt_flags, SMT); } #endif if (shared_caches) { powerpc_topology[i++] = - SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE); + SDTL_INIT(tl_cache_mask, powerpc_shared_cache_flags, CACHE); } if (has_coregroup_support()) { powerpc_topology[i++] = - SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC); + SDTL_INIT(tl_mc_mask, powerpc_shared_proc_flags, MC); } - powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, 
powerpc_shared_proc_flags, PKG); + powerpc_topology[i++] = SDTL_INIT(tl_pkg_mask, powerpc_shared_proc_flags, PKG); /* There must be one trailing NULL entry left. */ BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 46569b8e47dd..1594c80e9bc4 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -509,33 +509,27 @@ int topology_cpu_init(struct cpu *cpu) return rc; } -static const struct cpumask *cpu_thread_mask(int cpu) -{ - return &cpu_topology[cpu].thread_mask; -} - - const struct cpumask *cpu_coregroup_mask(int cpu) { return &cpu_topology[cpu].core_mask; } -static const struct cpumask *cpu_book_mask(int cpu) +static const struct cpumask *tl_book_mask(struct sched_domain_topology_level *tl, int cpu) { return &cpu_topology[cpu].book_mask; } -static const struct cpumask *cpu_drawer_mask(int cpu) +static const struct cpumask *tl_drawer_mask(struct sched_domain_topology_level *tl, int cpu) { return &cpu_topology[cpu].drawer_mask; } static struct sched_domain_topology_level s390_topology[] = { - SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT), - SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), - SDTL_INIT(cpu_book_mask, NULL, BOOK), - SDTL_INIT(cpu_drawer_mask, NULL, DRAWER), - SDTL_INIT(cpu_cpu_mask, NULL, PKG), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), + SDTL_INIT(tl_book_mask, NULL, BOOK), + SDTL_INIT(tl_drawer_mask, NULL, DRAWER), + SDTL_INIT(tl_pkg_mask, NULL, PKG), { NULL, }, }; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 33e166f6ab12..eb289abece23 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -479,14 +479,14 @@ static int x86_cluster_flags(void) static bool x86_has_numa_in_package; static struct sched_domain_topology_level x86_topology[] = { - SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), #ifdef CONFIG_SCHED_CLUSTER - SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS), + SDTL_INIT(tl_cls_mask, x86_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC), + SDTL_INIT(tl_mc_mask, x86_core_flags, MC), #endif - SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG), + SDTL_INIT(tl_pkg_mask, x86_sched_itmt_flags, PKG), { NULL }, }; diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 5263746b63e8..a3a24e115d44 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -30,11 +30,19 @@ struct sd_flag_debug { }; extern const struct sd_flag_debug sd_flag_debug[]; +struct sched_domain_topology_level; + #ifdef CONFIG_SCHED_SMT static inline int cpu_smt_flags(void) { return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; } + +static inline const +struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_smt_mask(cpu); +} #endif #ifdef CONFIG_SCHED_CLUSTER @@ -42,6 +50,12 @@ static inline int cpu_cluster_flags(void) { return SD_CLUSTER | SD_SHARE_LLC; } + +static inline const +struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_clustergroup_mask(cpu); +} #endif #ifdef CONFIG_SCHED_MC @@ -49,8 +63,20 @@ static inline int cpu_core_flags(void) { return SD_SHARE_LLC; } + +static inline const +struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_coregroup_mask(cpu); +} #endif +static inline const +struct cpumask *tl_pkg_mask(struct 
sched_domain_topology_level *tl, int cpu) +{ + return cpu_node_mask(cpu); +} + #ifdef CONFIG_NUMA static inline int cpu_numa_flags(void) { @@ -172,7 +198,7 @@ bool cpus_equal_capacity(int this_cpu, int that_cpu); bool cpus_share_cache(int this_cpu, int that_cpu); bool cpus_share_resources(int this_cpu, int that_cpu); -typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +typedef const struct cpumask *(*sched_domain_mask_f)(struct sched_domain_topology_level *tl, int cpu); typedef int (*sched_domain_flags_f)(void); struct sd_data { diff --git a/include/linux/topology.h b/include/linux/topology.h index 33b7fda97d39..6575af39fd10 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -260,7 +260,7 @@ static inline bool topology_is_primary_thread(unsigned int cpu) #endif -static inline const struct cpumask *cpu_cpu_mask(int cpu) +static inline const struct cpumask *cpu_node_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); } diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 977e133bb8a4..18889bd97e22 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1591,7 +1591,6 @@ static void claim_allocations(int cpu, struct sched_domain *sd) enum numa_topology_type sched_numa_topology_type; static int sched_domains_numa_levels; -static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; @@ -1632,14 +1631,7 @@ sd_init(struct sched_domain_topology_level *tl, int sd_id, sd_weight, sd_flags = 0; struct cpumask *sd_span; -#ifdef CONFIG_NUMA - /* - * Ugly hack to pass state to sd_numa_mask()... - */ - sched_domains_curr_level = tl->numa_level; -#endif - - sd_weight = cpumask_weight(tl->mask(cpu)); + sd_weight = cpumask_weight(tl->mask(tl, cpu)); if (tl->sd_flags) sd_flags = (*tl->sd_flags)(); @@ -1677,7 +1669,7 @@ sd_init(struct sched_domain_topology_level *tl, }; sd_span = sched_domain_span(sd); - cpumask_and(sd_span, cpu_map, tl->mask(cpu)); + cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); sd_id = cpumask_first(sd_span); sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); @@ -1737,17 +1729,17 @@ sd_init(struct sched_domain_topology_level *tl, */ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT - SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), + SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), #endif #ifdef CONFIG_SCHED_CLUSTER - SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS), + SDTL_INIT(tl_cls_mask, cpu_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), + SDTL_INIT(tl_mc_mask, cpu_core_flags, MC), #endif - SDTL_INIT(cpu_cpu_mask, NULL, PKG), + SDTL_INIT(tl_pkg_mask, NULL, PKG), { NULL, }, }; @@ -1769,9 +1761,9 @@ void __init set_sched_topology(struct sched_domain_topology_level *tl) #ifdef CONFIG_NUMA -static const struct cpumask *sd_numa_mask(int cpu) +static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu) { - return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; + return sched_domains_numa_masks[tl->numa_level][cpu_to_node(cpu)]; } static void sched_numa_warn(const char *str) @@ -2411,7 +2403,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map) * breaks the linking done for an earlier span. 
*/ for_each_cpu(cpu, cpu_map) { - const struct cpumask *tl_cpu_mask = tl->mask(cpu); + const struct cpumask *tl_cpu_mask = tl->mask(tl, cpu); int id; /* lowest bit set in this mask is used as a unique id */ @@ -2419,7 +2411,7 @@ static bool topology_span_sane(const struct cpumask *cpu_map) if (cpumask_test_cpu(id, id_seen)) { /* First CPU has already been seen, ensure identical spans */ - if (!cpumask_equal(tl->mask(id), tl_cpu_mask)) + if (!cpumask_equal(tl->mask(tl, id), tl_cpu_mask)) return false; } else { /* First CPU hasn't been seen before, ensure it's a completely new span */ -- cgit v1.2.3 From 91c614f09abf1d45aac6b475d82a36c704b527ee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Aug 2025 10:55:55 +0200 Subject: sched: Move STDL_INIT() functions out-of-line Since all these functions are address-taken in SDTL_INIT() and called indirectly, it doesn't really make sense for them to be inline. Suggested-by: Christophe Leroy Signed-off-by: Peter Zijlstra (Intel) --- include/linux/sched/topology.h | 49 ++++++------------------------------------ kernel/sched/topology.c | 45 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index a3a24e115d44..bbcfdf12aa6e 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -33,56 +33,21 @@ extern const struct sd_flag_debug sd_flag_debug[]; struct sched_domain_topology_level; #ifdef CONFIG_SCHED_SMT -static inline int cpu_smt_flags(void) -{ - return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; -} - -static inline const -struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu) -{ - return cpu_smt_mask(cpu); -} +extern int cpu_smt_flags(void); +extern const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu); #endif #ifdef CONFIG_SCHED_CLUSTER -static inline int cpu_cluster_flags(void) -{ - return SD_CLUSTER | SD_SHARE_LLC; -} - -static inline const -struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu) -{ - return cpu_clustergroup_mask(cpu); -} +extern int cpu_cluster_flags(void); +extern const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu); #endif #ifdef CONFIG_SCHED_MC -static inline int cpu_core_flags(void) -{ - return SD_SHARE_LLC; -} - -static inline const -struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu) -{ - return cpu_coregroup_mask(cpu); -} +extern int cpu_core_flags(void); +extern const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu); #endif -static inline const -struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu) -{ - return cpu_node_mask(cpu); -} - -#ifdef CONFIG_NUMA -static inline int cpu_numa_flags(void) -{ - return SD_NUMA; -} -#endif +extern const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu); extern int arch_asym_cpu_priority(int cpu); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 18889bd97e22..1104d931c015 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1724,6 +1724,47 @@ sd_init(struct sched_domain_topology_level *tl, return sd; } +#ifdef CONFIG_SCHED_SMT +int cpu_smt_flags(void) +{ + return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; +} + +const struct cpumask *tl_smt_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_smt_mask(cpu); +} +#endif + +#ifdef CONFIG_SCHED_CLUSTER +int 
cpu_cluster_flags(void) +{ + return SD_CLUSTER | SD_SHARE_LLC; +} + +const struct cpumask *tl_cls_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_clustergroup_mask(cpu); +} +#endif + +#ifdef CONFIG_SCHED_MC +int cpu_core_flags(void) +{ + return SD_SHARE_LLC; +} + +const struct cpumask *tl_mc_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_coregroup_mask(cpu); +} +#endif + +const struct cpumask *tl_pkg_mask(struct sched_domain_topology_level *tl, int cpu) +{ + return cpu_node_mask(cpu); +} + /* * Topology list, bottom-up. */ @@ -1760,6 +1801,10 @@ void __init set_sched_topology(struct sched_domain_topology_level *tl) } #ifdef CONFIG_NUMA +static int cpu_numa_flags(void) +{ + return SD_NUMA; +} static const struct cpumask *sd_numa_mask(struct sched_domain_topology_level *tl, int cpu) { -- cgit v1.2.3 From 2cd571245b43492867bf1b4252485f3e6647b643 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 29 Aug 2025 16:11:16 +0800 Subject: sched/fair: Add related data structure for task based throttle Add related data structures for this new throttle functionality. Tested-by: K Prateek Nayak Signed-off-by: Valentin Schneider Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chengming Zhou Tested-by: Valentin Schneider Tested-by: Matteo Martelli Link: https://lore.kernel.org/r/20250829081120.806-2-ziqianlu@bytedance.com --- include/linux/sched.h | 5 +++++ kernel/sched/core.c | 3 +++ kernel/sched/fair.c | 13 +++++++++++++ kernel/sched/sched.h | 3 +++ 4 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f8188b833350..644a01bdae70 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -883,6 +883,11 @@ struct task_struct { #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; +#ifdef CONFIG_CFS_BANDWIDTH + struct callback_head sched_throttle_work; + struct list_head throttle_node; + bool throttled; +#endif #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4..feb750aae71b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4490,6 +4490,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; +#ifdef CONFIG_CFS_BANDWIDTH + init_cfs_throttle_work(p); +#endif #endif #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c..8fff40fcbc42 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5748,6 +5748,18 @@ static inline int throttled_lb_pair(struct task_group *tg, throttled_hierarchy(dest_cfs_rq); } +static void throttle_cfs_rq_work(struct callback_head *work) +{ +} + +void init_cfs_throttle_work(struct task_struct *p) +{ + init_task_work(&p->sched_throttle_work, throttle_cfs_rq_work); + /* Protect against double add, see throttle_cfs_rq() and throttle_cfs_rq_work() */ + p->sched_throttle_work.next = &p->sched_throttle_work; + INIT_LIST_HEAD(&p->throttle_node); +} + static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; @@ -6472,6 +6484,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); + INIT_LIST_HEAD(&cfs_rq->throttled_limbo_list); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index be9745d104f7..a6493d255397 100644 --- a/kernel/sched/sched.h +++ 
b/kernel/sched/sched.h @@ -739,6 +739,7 @@ struct cfs_rq { int throttle_count; struct list_head throttled_list; struct list_head throttled_csd_list; + struct list_head throttled_limbo_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -2658,6 +2659,8 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); extern void init_dl_entity(struct sched_dl_entity *dl_se); +extern void init_cfs_throttle_work(struct task_struct *p); + #define BW_SHIFT 20 #define BW_UNIT (1 << BW_SHIFT) #define RATIO_SHIFT 8 -- cgit v1.2.3 From 74e2ef72bd4b25ce21c8f309d4f5b91b5df9ff5b Mon Sep 17 00:00:00 2001 From: Gokul Sivakumar Date: Thu, 24 Jul 2025 15:41:36 +0530 Subject: wifi: brcmfmac: fix 43752 SDIO FWVID incorrectly labelled as Cypress (CYW) Cypress(Infineon) is not the vendor for this 43752 SDIO WLAN chip, and so has not officially released any firmware binary for it. It is incorrect to maintain this WLAN chip with firmware vendor ID as "CYW". So relabel the chip's firmware Vendor ID as "WCC" as suggested by the maintainer. Fixes: d2587c57ffd8 ("brcmfmac: add 43752 SDIO ids and initialization") Fixes: f74f1ec22dc2 ("wifi: brcmfmac: add support for Cypress firmware api") Signed-off-by: Gokul Sivakumar Acked-by: Arend van Spriel Link: https://patch.msgid.link/20250724101136.6691-1-gokulkumar.sivakumar@infineon.com Signed-off-by: Johannes Berg --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c | 2 +- drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c | 4 ++-- drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c | 8 ++++---- drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h | 1 - include/linux/mmc/sdio_ids.h | 2 +- 5 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c index 8ab7d1e34a6e..6a3f187320fc 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/bcmsdh.c @@ -997,9 +997,9 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = { BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4356, WCC), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_4359, WCC), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43751, WCC), + BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_43752, WCC), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_4373, CYW), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43012, CYW), - BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752, CYW), BRCMF_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_89359, CYW), CYW_SDIO_DEVICE(SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439, CYW), { /* end: all zeroes */ } diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c index 9074ab49e806..4239f2b21e54 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/chip.c @@ -738,8 +738,8 @@ static u32 brcmf_chip_tcm_rambase(struct brcmf_chip_priv *ci) case BRCM_CC_4364_CHIP_ID: case CY_CC_4373_CHIP_ID: return 0x160000; - case CY_CC_43752_CHIP_ID: case BRCM_CC_43751_CHIP_ID: + case BRCM_CC_43752_CHIP_ID: case BRCM_CC_4377_CHIP_ID: return 0x170000; case BRCM_CC_4378_CHIP_ID: @@ -1452,7 +1452,7 @@ bool brcmf_chip_sr_capable(struct brcmf_chip *pub) return (reg & CC_SR_CTL0_ENABLE_MASK) != 0; case BRCM_CC_4359_CHIP_ID: case BRCM_CC_43751_CHIP_ID: - case CY_CC_43752_CHIP_ID: + case BRCM_CC_43752_CHIP_ID: case 
CY_CC_43012_CHIP_ID: addr = CORE_CC_REG(pmu->base, retention_ctl); reg = chip->ops->read32(chip->ctx, addr); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c index 8a0bad5119a0..8cf9d7e7c3f7 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c @@ -655,10 +655,10 @@ static const struct brcmf_firmware_mapping brcmf_sdio_fwnames[] = { BRCMF_FW_ENTRY(BRCM_CC_4356_CHIP_ID, 0xFFFFFFFF, 4356), BRCMF_FW_ENTRY(BRCM_CC_4359_CHIP_ID, 0xFFFFFFFF, 4359), BRCMF_FW_ENTRY(BRCM_CC_43751_CHIP_ID, 0xFFFFFFFF, 43752), + BRCMF_FW_ENTRY(BRCM_CC_43752_CHIP_ID, 0xFFFFFFFF, 43752), BRCMF_FW_ENTRY(CY_CC_4373_CHIP_ID, 0xFFFFFFFF, 4373), BRCMF_FW_ENTRY(CY_CC_43012_CHIP_ID, 0xFFFFFFFF, 43012), BRCMF_FW_ENTRY(CY_CC_43439_CHIP_ID, 0xFFFFFFFF, 43439), - BRCMF_FW_ENTRY(CY_CC_43752_CHIP_ID, 0xFFFFFFFF, 43752) }; #define TXCTL_CREDITS 2 @@ -3426,8 +3426,8 @@ err: static bool brcmf_sdio_aos_no_decode(struct brcmf_sdio *bus) { if (bus->ci->chip == BRCM_CC_43751_CHIP_ID || - bus->ci->chip == CY_CC_43012_CHIP_ID || - bus->ci->chip == CY_CC_43752_CHIP_ID) + bus->ci->chip == BRCM_CC_43752_CHIP_ID || + bus->ci->chip == CY_CC_43012_CHIP_ID) return true; else return false; @@ -4278,8 +4278,8 @@ static void brcmf_sdio_firmware_callback(struct device *dev, int err, switch (sdiod->func1->device) { case SDIO_DEVICE_ID_BROADCOM_43751: + case SDIO_DEVICE_ID_BROADCOM_43752: case SDIO_DEVICE_ID_BROADCOM_CYPRESS_4373: - case SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752: brcmf_dbg(INFO, "set F2 watermark to 0x%x*4 bytes\n", CY_4373_F2_WATERMARK); brcmf_sdiod_writeb(sdiod, SBSDIO_WATERMARK, diff --git a/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h b/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h index b39c5c1ee18b..df3b67ba4db2 100644 --- a/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h +++ b/drivers/net/wireless/broadcom/brcm80211/include/brcm_hw_ids.h @@ -60,7 +60,6 @@ #define CY_CC_4373_CHIP_ID 0x4373 #define CY_CC_43012_CHIP_ID 43012 #define CY_CC_43439_CHIP_ID 43439 -#define CY_CC_43752_CHIP_ID 43752 /* USB Device IDs */ #define BRCM_USB_43143_DEVICE_ID 0xbd1e diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index fe3d6d98f8da..673cbdf43453 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -77,7 +77,7 @@ #define SDIO_DEVICE_ID_BROADCOM_43439 0xa9af #define SDIO_DEVICE_ID_BROADCOM_43455 0xa9bf #define SDIO_DEVICE_ID_BROADCOM_43751 0xaae7 -#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752 0xaae8 +#define SDIO_DEVICE_ID_BROADCOM_43752 0xaae8 #define SDIO_VENDOR_ID_CYPRESS 0x04b4 #define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439 0xbd3d -- cgit v1.2.3 From 0a26e5eb78fb1627beb8e3eb172737f8492d2799 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 25 Aug 2025 15:34:23 -0500 Subject: jiffies: Remove obsolete SHIFTED_HZ comment b3c869d35b9b ("jiffies: Remove compile time assumptions about CLOCK_TICK_RATE") removed the last definition of SHIFTED_HZ but left behind comments about it. Remove the comments as well. 
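For reference (illustrative arithmetic only, the exact values depend on the configured HZ): the +HZ/2 term in both definitions below rounds to the nearest unit instead of truncating, so with HZ=250, TICK_USEC works out to (1000000 + 125) / 250 = 4000 and TICK_NSEC to (1000000000 + 125) / 250 = 4000000, i.e. a 4 ms tick.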
Signed-off-by: Bjorn Helgaas Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250825203425.796034-1-helgaas@kernel.org --- include/linux/jiffies.h | 2 +- include/vdso/jiffies.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 91b20788273d..0d1927da8055 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -61,7 +61,7 @@ extern void register_refined_jiffies(long clock_tick_rate); -/* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */ +/* TICK_USEC is the time between ticks in usec */ #define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ) /* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ diff --git a/include/vdso/jiffies.h b/include/vdso/jiffies.h index 2f9d596c8b29..8ca04a141412 100644 --- a/include/vdso/jiffies.h +++ b/include/vdso/jiffies.h @@ -5,7 +5,7 @@ #include /* for HZ */ #include -/* TICK_NSEC is the time between ticks in nsec assuming SHIFTED_HZ */ +/* TICK_NSEC is the time between ticks in nsec */ #define TICK_NSEC ((NSEC_PER_SEC+HZ/2)/HZ) #endif /* __VDSO_JIFFIES_H */ -- cgit v1.2.3 From a576a849d5f33356e0d8fd3eae4fbaf8869417e5 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Tue, 5 Aug 2025 15:34:43 +0200 Subject: of/irq: Convert of_msi_map_id() callers to of_msi_xlate() With the introduction of the of_msi_xlate() function, the OF layer provides an API to map a device ID and retrieve the MSI controller node the ID is mapped to with a single call. of_msi_map_id() is currently used to map a device ID to a specific MSI controller node; of_msi_xlate() can be used for that purpose too, so there is no need to keep both functions. Convert of_msi_map_id() calls to of_msi_xlate() and update the of_msi_xlate() documentation to describe how the struct device_node pointer passed in should be set up to either provide the MSI controller node target or receive its pointer upon mapping completion. Signed-off-by: Lorenzo Pieralisi Cc: Thomas Gleixner Cc: Rob Herring Cc: Marc Zyngier Acked-by: Thomas Gleixner Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20250805133443.936955-1-lpieralisi@kernel.org Signed-off-by: Rob Herring (Arm) --- drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c | 2 +- drivers/of/irq.c | 25 +++++-------------------- drivers/pci/msi/irqdomain.c | 2 +- include/linux/of_irq.h | 6 ------ 4 files changed, 7 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c index 11549d85f23b..b5785472765a 100644 --- a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c +++ b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c @@ -30,7 +30,7 @@ static u32 fsl_mc_msi_domain_get_msi_id(struct irq_domain *domain, u32 out_id; of_node = irq_domain_get_of_node(domain); - out_id = of_node ? of_msi_map_id(&mc_dev->dev, of_node, mc_dev->icid) : + out_id = of_node ? of_msi_xlate(&mc_dev->dev, &of_node, mc_dev->icid) : iort_msi_map_id(&mc_dev->dev, mc_dev->icid); return out_id; diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 74aaea61de13..e7c12abd10ab 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -673,13 +673,14 @@ err: /** * of_msi_xlate - map a MSI ID and find relevant MSI controller node * @dev: device for which the mapping is to be done. - * @msi_np: Pointer to store the MSI controller node + * @msi_np: Pointer to target MSI controller node * @id_in: Device ID.
* * Walk up the device hierarchy looking for devices with a "msi-map" - * property. If found, apply the mapping to @id_in. @msi_np pointed - * value must be NULL on entry, if an MSI controller is found @msi_np is - * initialized to the MSI controller node with a reference held. + * property. If found, apply the mapping to @id_in. + * If @msi_np points to a non-NULL device node pointer, only entries targeting + * that node will be matched; if it points to a NULL value, it will receive the + * device node of the first matching target phandle, with a reference held. * * Returns: The mapped MSI id. */ @@ -699,22 +700,6 @@ u32 of_msi_xlate(struct device *dev, struct device_node **msi_np, u32 id_in) return id_out; } -/** - * of_msi_map_id - Map a MSI ID for a device. - * @dev: device for which the mapping is to be done. - * @msi_np: device node of the expected msi controller. - * @id_in: unmapped MSI ID for the device. - * - * Walk up the device hierarchy looking for devices with a "msi-map" - * property. If found, apply the mapping to @id_in. - * - * Return: The mapped MSI ID. - */ -u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in) -{ - return of_msi_xlate(dev, &msi_np, id_in); -} - /** * of_msi_map_get_device_domain - Use msi-map to find the relevant MSI domain * @dev: device for which the mapping is to be done. diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index 0938ef7ebabf..555c61b1fc36 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -422,7 +422,7 @@ u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev) pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid); of_node = irq_domain_get_of_node(domain); - rid = of_node ? of_msi_map_id(&pdev->dev, of_node, rid) : + rid = of_node ? of_msi_xlate(&pdev->dev, &of_node, rid) : iort_msi_map_id(&pdev->dev, rid); return rid; diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index a480063c9cb1..1db8543dfc8a 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -55,7 +55,6 @@ extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 bus_token); extern void of_msi_configure(struct device *dev, const struct device_node *np); extern u32 of_msi_xlate(struct device *dev, struct device_node **msi_np, u32 id_in); -u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in); #else static inline void of_irq_init(const struct of_device_id *matches) { @@ -105,11 +104,6 @@ static inline u32 of_msi_xlate(struct device *dev, struct device_node **msi_np, { return id_in; } -static inline u32 of_msi_map_id(struct device *dev, - struct device_node *msi_np, u32 id_in) -{ - return id_in; -} #endif #if defined(CONFIG_OF_IRQ) || defined(CONFIG_SPARC) -- cgit v1.2.3 From 54dbd2a8e974b900b18639e75f62702a4334ddc0 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 3 Sep 2025 14:52:56 +0300 Subject: PCI/P2PDMA: Reduce scope of pci_has_p2pmem() pci_has_p2pmem() is not used outside of p2pdma.c, and there is no need to export it for use by modules. 
Signed-off-by: Leon Romanovsky Signed-off-by: Bjorn Helgaas Reviewed-by: Logan Gunthorpe Link: https://patch.msgid.link/d40f3f1decf54c9236bc38b48a6aae612a5c182f.1756900291.git.leon@kernel.org --- drivers/pci/p2pdma.c | 3 +-- include/linux/pci-p2pdma.h | 5 ----- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 1cb5e423eed4..78e108e47254 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -738,7 +738,7 @@ EXPORT_SYMBOL_GPL(pci_p2pdma_distance_many); * pci_has_p2pmem - check if a given PCI device has published any p2pmem * @pdev: PCI device to check */ -bool pci_has_p2pmem(struct pci_dev *pdev) +static bool pci_has_p2pmem(struct pci_dev *pdev) { struct pci_p2pdma *p2pdma; bool res; @@ -750,7 +750,6 @@ bool pci_has_p2pmem(struct pci_dev *pdev) return res; } -EXPORT_SYMBOL_GPL(pci_has_p2pmem); /** * pci_p2pmem_find_many - find a peer-to-peer DMA memory device compatible with diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 075c20b161d9..951f81a38f3a 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -21,7 +21,6 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset); int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, int num_clients, bool verbose); -bool pci_has_p2pmem(struct pci_dev *pdev); struct pci_dev *pci_p2pmem_find_many(struct device **clients, int num_clients); void *pci_alloc_p2pmem(struct pci_dev *pdev, size_t size); void pci_free_p2pmem(struct pci_dev *pdev, void *addr, size_t size); @@ -45,10 +44,6 @@ static inline int pci_p2pdma_distance_many(struct pci_dev *provider, { return -1; } -static inline bool pci_has_p2pmem(struct pci_dev *pdev) -{ - return false; -} static inline struct pci_dev *pci_p2pmem_find_many(struct device **clients, int num_clients) { -- cgit v1.2.3 From dd386b0d5e61556927189cd7b59a628d22cb6851 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 1 Sep 2025 19:26:07 -0600 Subject: io_uring/uring_cmd: correct io_uring_cmd_done() ret type io_uring_cmd_done() takes the result code for the CQE as a ssize_t ret argument. However, the CQE res field is a s32 value, as is the argument to io_req_set_res(). To clarify that only s32 values can be faithfully represented without truncation, change io_uring_cmd_done()'s ret argument type to s32. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250902012609.1513123-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 4 ++-- io_uring/uring_cmd.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 7211157edfe9..c4d7874016bb 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -56,7 +56,7 @@ int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, * Note: the caller should never hard code @issue_flags and is only allowed * to pass the mask provided by the core io_uring code. 
*/ -void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, u64 res2, +void io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 res2, unsigned issue_flags); void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, @@ -104,7 +104,7 @@ static inline int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, { return -EOPNOTSUPP; } -static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, +static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 ret2, unsigned issue_flags) { } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index d76d6d27765c..5562e8491c5b 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -151,7 +151,7 @@ static inline void io_req_set_cqe32_extra(struct io_kiocb *req, * Called by consumers of io_uring_cmd, if they originally returned * -EIOCBQUEUED upon receiving the command. */ -void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, +void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, unsigned issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); -- cgit v1.2.3 From 9f8608fce90fbcd2a98ceefad0bc762423927629 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 1 Sep 2025 19:33:27 -0600 Subject: io_uring/cmd: remove unused io_uring_cmd_iopoll_done() io_uring_cmd_iopoll_done()'s only caller was removed in commit 9ce6c9875f3e ("nvme: always punt polled uring_cmd end_io work to task_work"). So remove the unused function too. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250902013328.1517686-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index c4d7874016bb..50dd6a53cb5e 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -133,17 +133,6 @@ static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, } #endif -/* - * Polled completions must ensure they are coming from a poll queue, and - * hence are completed inside the usual poll handling loops. - */ -static inline void io_uring_cmd_iopoll_done(struct io_uring_cmd *ioucmd, - ssize_t ret, ssize_t res2) -{ - lockdep_assert(in_task()); - io_uring_cmd_done(ioucmd, ret, res2, 0); -} - /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, io_uring_cmd_tw_t task_work_cb) -- cgit v1.2.3 From 8c94db0ae97c72c253a615f990bd466b456e94f6 Mon Sep 17 00:00:00 2001 From: Svetlana Parfenova Date: Mon, 1 Sep 2025 20:53:50 +0700 Subject: binfmt_elf: preserve original ELF e_flags for core dumps Some architectures, such as RISC-V, use the ELF e_flags field to encode ABI-specific information (e.g., ISA extensions, fpu support). Debuggers like GDB rely on these flags in core dumps to correctly interpret optional register sets. If the flags are missing or incorrect, GDB may warn and ignore valid data, for example: warning: Unexpected size of section '.reg2/213' in core file. This can prevent access to fpu or other architecture-specific registers even when they were dumped. Save the e_flags field during ELF binary loading (in load_elf_binary()) into the mm_struct, and later retrieve it during core dump generation (in fill_note_info()). Kconfig option CONFIG_ARCH_HAS_ELF_CORE_EFLAGS is introduced for architectures that require this behaviour. 
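As an illustrative usage check (not part of this patch; the file names are hypothetical and the exact output format depends on the binutils version), the preserved field can be compared between a RISC-V executable and a core dump it produced using readelf, where the two Flags lines are now expected to match:

  $ readelf -h ./app | grep Flags
    Flags: 0x5, RVC, double-float ABI
  $ readelf -h ./core | grep Flags
    Flags: 0x5, RVC, double-float ABI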
Signed-off-by: Svetlana Parfenova Link: https://lore.kernel.org/r/20250901135350.619485-1-svetlana.parfenova@syntacore.com Signed-off-by: Kees Cook --- arch/riscv/Kconfig | 1 + fs/Kconfig.binfmt | 9 +++++++++ fs/binfmt_elf.c | 40 ++++++++++++++++++++++++++++++++++------ include/linux/mm_types.h | 5 +++++ 4 files changed, 49 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a4b233a0659e..b06a758abb63 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -28,6 +28,7 @@ config RISCV select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_WX + select ARCH_HAS_ELF_CORE_EFLAGS select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index bd2f530e5740..1949e25c7741 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -184,4 +184,13 @@ config EXEC_KUNIT_TEST This builds the exec KUnit tests, which tests boundary conditions of various aspects of the exec internals. +config ARCH_HAS_ELF_CORE_EFLAGS + bool + depends on BINFMT_ELF && ELF_CORE + default n + help + Select this option if the architecture makes use of the e_flags + field in the ELF header to store ABI or other architecture-specific + information that should be preserved in core dumps. + endmenu diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 4aacf9c9cc2d..e4653bb99946 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -103,6 +103,21 @@ static struct linux_binfmt elf_format = { #define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) +static inline void elf_coredump_set_mm_eflags(struct mm_struct *mm, u32 flags) +{ +#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS + mm->saved_e_flags = flags; +#endif +} + +static inline u32 elf_coredump_get_mm_eflags(struct mm_struct *mm, u32 flags) +{ +#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS + flags = mm->saved_e_flags; +#endif + return flags; +} + /* * We need to explicitly zero any trailing portion of the page that follows * p_filesz when it ends before the page ends (e.g. bss), otherwise this @@ -1290,6 +1305,8 @@ out_free_interp: mm->end_data = end_data; mm->start_stack = bprm->p; + elf_coredump_set_mm_eflags(mm, elf_ex->e_flags); + /** * DOC: "brk" handling * @@ -1804,6 +1821,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_thread_core_info *t; struct elf_prpsinfo *psinfo; struct core_thread *ct; + u16 machine; + u32 flags; psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); if (!psinfo) @@ -1831,17 +1850,26 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, return 0; } - /* - * Initialize the ELF file header. - */ - fill_elf_header(elf, phdrs, - view->e_machine, view->e_flags); + machine = view->e_machine; + flags = view->e_flags; #else view = NULL; info->thread_notes = 2; - fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); + machine = ELF_ARCH; + flags = ELF_CORE_EFLAGS; #endif + /* + * Override ELF e_flags with value taken from process, + * if arch needs that. + */ + flags = elf_coredump_get_mm_eflags(dump_task->mm, flags); + + /* + * Initialize the ELF file header. + */ + fill_elf_header(elf, phdrs, machine, flags); + /* * Allocate a structure for each thread. 
*/ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 08bc2442db93..04a2857f12f2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1102,6 +1102,11 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ +#ifdef CONFIG_ARCH_HAS_ELF_CORE_EFLAGS + /* the ABI-related flags from the ELF header. Used for core dump */ + unsigned long saved_e_flags; +#endif + struct percpu_counter rss_stat[NR_MM_COUNTERS]; struct linux_binfmt *binfmt; -- cgit v1.2.3 From e0c47c6229c25b54440fe1f84a0ff533942290b1 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Fri, 25 Jul 2025 23:22:20 +1000 Subject: wifi: mac80211: support parsing S1G TIM PVB An S1G TIM PVB has 3 mandatory encoding modes, namely block bitmap, single AID and OLB, alongside the ability for each encoding mode to be inverted. Introduce the ability to parse the 3 encoding formats. The implementation specification for the encoding formats can be found in IEEE80211-2024 9.4.2.5. Signed-off-by: Arien Judge Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250725132221.258217-3-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/carl9170/rx.c | 2 +- drivers/net/wireless/intersil/p54/txrx.c | 2 +- drivers/net/wireless/ralink/rt2x00/rt2x00dev.c | 2 +- drivers/net/wireless/realtek/rtlwifi/ps.c | 2 +- include/linux/ieee80211.h | 265 ++++++++++++++++++++++++- net/mac80211/mesh_ps.c | 2 +- net/mac80211/mlme.c | 3 +- 7 files changed, 263 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/ath/carl9170/rx.c b/drivers/net/wireless/ath/carl9170/rx.c index 908c4c8b7f82..6833430130f4 100644 --- a/drivers/net/wireless/ath/carl9170/rx.c +++ b/drivers/net/wireless/ath/carl9170/rx.c @@ -555,7 +555,7 @@ static void carl9170_ps_beacon(struct ar9170 *ar, void *data, unsigned int len) /* Check whenever the PHY can be turned off again. */ /* 1. What about buffered unicast traffic for our AID? */ - cam = ieee80211_check_tim(tim_ie, tim_len, ar->common.curaid); + cam = ieee80211_check_tim(tim_ie, tim_len, ar->common.curaid, false); /* 2. Maybe the AP wants to send multicast/broadcast data? */ cam |= !!(tim_ie->bitmap_ctrl & 0x01); diff --git a/drivers/net/wireless/intersil/p54/txrx.c b/drivers/net/wireless/intersil/p54/txrx.c index 2deb1bb54f24..1294a1d6528e 100644 --- a/drivers/net/wireless/intersil/p54/txrx.c +++ b/drivers/net/wireless/intersil/p54/txrx.c @@ -317,7 +317,7 @@ static void p54_pspoll_workaround(struct p54_common *priv, struct sk_buff *skb) tim_len = tim[1]; tim_ie = (struct ieee80211_tim_ie *) &tim[2]; - new_psm = ieee80211_check_tim(tim_ie, tim_len, priv->aid); + new_psm = ieee80211_check_tim(tim_ie, tim_len, priv->aid, false); if (new_psm != priv->powersave_override) { priv->powersave_override = new_psm; p54_set_ps(priv); diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c index 7db29e90eb4f..f8a6f9c968a1 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00dev.c @@ -679,7 +679,7 @@ static void rt2x00lib_rxdone_check_ps(struct rt2x00_dev *rt2x00dev, /* Check whenever the PHY can be turned off again. */ /* 1. What about buffered unicast traffic for our AID? */ - cam = ieee80211_check_tim(tim_ie, tim_len, rt2x00dev->aid); + cam = ieee80211_check_tim(tim_ie, tim_len, rt2x00dev->aid, false); /* 2. Maybe the AP wants to send multicast/broadcast data?
*/ cam |= (tim_ie->bitmap_ctrl & 0x01); diff --git a/drivers/net/wireless/realtek/rtlwifi/ps.c b/drivers/net/wireless/realtek/rtlwifi/ps.c index 6241e4fed4f6..bcab12c3b4c1 100644 --- a/drivers/net/wireless/realtek/rtlwifi/ps.c +++ b/drivers/net/wireless/realtek/rtlwifi/ps.c @@ -519,7 +519,7 @@ void rtl_swlps_beacon(struct ieee80211_hw *hw, void *data, unsigned int len) /* 1. What about buffered unicast traffic for our AID? */ u_buffed = ieee80211_check_tim(tim_ie, tim_len, - rtlpriv->mac80211.assoc_id); + rtlpriv->mac80211.assoc_id, false); /* 2. Maybe the AP wants to send multicast/broadcast data? */ m_buffed = tim_ie->bitmap_ctrl & 0x01; diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index e5a2096e022e..d350263f23f3 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -220,6 +220,12 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) #define IEEE80211_MAX_AID_S1G 8191 #define IEEE80211_MAX_TIM_LEN 251 #define IEEE80211_MAX_MESH_PEERINGS 63 + +/* S1G encoding types */ +#define IEEE80211_S1G_TIM_ENC_MODE_BLOCK 0 +#define IEEE80211_S1G_TIM_ENC_MODE_SINGLE 1 +#define IEEE80211_S1G_TIM_ENC_MODE_OLB 2 + /* Maximum size for the MA-UNITDATA primitive, 802.11 standard section 6.2.1.1.2. @@ -4757,15 +4763,8 @@ static inline unsigned long ieee80211_tu_to_usec(unsigned long tu) return 1024 * tu; } -/** - * ieee80211_check_tim - check if AID bit is set in TIM - * @tim: the TIM IE - * @tim_len: length of the TIM IE - * @aid: the AID to look for - * Return: whether or not traffic is indicated in the TIM for the given AID - */ -static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, - u8 tim_len, u16 aid) +static inline bool __ieee80211_check_tim(const struct ieee80211_tim_ie *tim, + u8 tim_len, u16 aid) { u8 mask; u8 index, indexn1, indexn2; @@ -4788,6 +4787,254 @@ static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, return !!(tim->virtual_map[index] & mask); } +struct s1g_tim_aid { + u16 aid; + u8 target_blk; /* Target block index */ + u8 target_subblk; /* Target subblock index */ + u8 target_subblk_bit; /* Target subblock bit */ +}; + +struct s1g_tim_enc_block { + u8 enc_mode; + bool inverse; + const u8 *ptr; + u8 len; + + /* + * For an OLB encoded block that spans multiple blocks, this + * is the offset into the span described by that encoded block. + */ + u8 olb_blk_offset; +}; + +/* + * Helper routines to quickly extract the length of an encoded block. Validation + * is also performed to ensure the length extracted lies within the TIM. + */ + +static inline int ieee80211_s1g_len_bitmap(const u8 *ptr, const u8 *end) +{ + u8 blkmap; + u8 n_subblks; + + if (ptr >= end) + return -EINVAL; + + blkmap = *ptr; + n_subblks = hweight8(blkmap); + + if (ptr + 1 + n_subblks > end) + return -EINVAL; + + return 1 + n_subblks; +} + +static inline int ieee80211_s1g_len_single(const u8 *ptr, const u8 *end) +{ + return (ptr + 1 > end) ? -EINVAL : 1; +} + +static inline int ieee80211_s1g_len_olb(const u8 *ptr, const u8 *end) +{ + if (ptr >= end) + return -EINVAL; + + return (ptr + 1 + *ptr > end) ? -EINVAL : 1 + *ptr; +} + +/* + * Enumerate all encoded blocks until we find the encoded block that describes + * our target AID. OLB is a special case, as a single encoded block can + * describe multiple blocks.
+ */ +static inline int ieee80211_s1g_find_target_block(struct s1g_tim_enc_block *enc, + const struct s1g_tim_aid *aid, + const u8 *ptr, const u8 *end) +{ + /* need at least block-control octet */ + while (ptr + 1 <= end) { + u8 ctrl = *ptr++; + u8 mode = ctrl & 0x03; + bool contains, inverse = ctrl & BIT(2); + u8 span, blk_off = ctrl >> 3; + int len; + + switch (mode) { + case IEEE80211_S1G_TIM_ENC_MODE_BLOCK: + len = ieee80211_s1g_len_bitmap(ptr, end); + contains = blk_off == aid->target_blk; + break; + case IEEE80211_S1G_TIM_ENC_MODE_SINGLE: + len = ieee80211_s1g_len_single(ptr, end); + contains = blk_off == aid->target_blk; + break; + case IEEE80211_S1G_TIM_ENC_MODE_OLB: + len = ieee80211_s1g_len_olb(ptr, end); + /* + * An OLB encoded block can describe more than one + * block, meaning an encoded OLB block can span more + * than a single block. + */ + if (len > 0) { + /* Minus one for the length octet */ + span = DIV_ROUND_UP(len - 1, 8); + /* + * Check if our target block lies within the + * block span described by this encoded block. + */ + contains = (aid->target_blk >= blk_off) && + (aid->target_blk < blk_off + span); + } + break; + default: + return -EOPNOTSUPP; + } + + if (len < 0) + return len; + + if (contains) { + enc->enc_mode = mode; + enc->inverse = inverse; + enc->ptr = ptr; + enc->len = (u8)len; + enc->olb_blk_offset = blk_off; + return 0; + } + + ptr += len; + } + + return -ENOENT; +} + +static inline bool ieee80211_s1g_parse_bitmap(struct s1g_tim_enc_block *enc, + struct s1g_tim_aid *aid) +{ + const u8 *ptr = enc->ptr; + u8 blkmap = *ptr++; + + /* + * If our block bitmap does not contain a set bit that corresponds + * to our AID, it could mean a variety of things depending on if + * the encoding mode is inverted or not. + * + * 1. If inverted, it means the entire subblock is present and hence + * our AID has been set. + * 2. If not inverted, it means our subblock is not present and hence + * it is all zero meaning our AID is not set. + */ + if (!(blkmap & BIT(aid->target_subblk))) + return enc->inverse; + + /* + * Increment ptr by the number of set subblocks that appear before our + * target subblock. If our target subblock is 0, do nothing as ptr + * already points to our target subblock. + */ + if (aid->target_subblk) + ptr += hweight8(blkmap & GENMASK(aid->target_subblk - 1, 0)); + + return !!(*ptr & BIT(aid->target_subblk_bit)) ^ enc->inverse; +} + +static inline bool ieee80211_s1g_parse_single(struct s1g_tim_enc_block *enc, + struct s1g_tim_aid *aid) +{ + /* + * Single AID mode describes, as the name suggests, a single AID + * within the block described by the encoded block. The octet + * contains the 6 LSBs of the AID described in the block. The other + * 2 bits are reserved. When inverted, every AID described + * by the current block has buffered traffic except for the AID + * described in the single AID octet. + */ + return ((*enc->ptr & 0x3f) == (aid->aid & 0x3f)) ^ enc->inverse; +} + +static inline bool ieee80211_s1g_parse_olb(struct s1g_tim_enc_block *enc, + struct s1g_tim_aid *aid) +{ + const u8 *ptr = enc->ptr; + u8 blk_len = *ptr++; + /* + * Given an OLB encoded block that describes multiple blocks, + * calculate the offset into the span. Then calculate the + * subblock location normally.
+ */ + u16 span_offset = aid->target_blk - enc->olb_blk_offset; + u16 subblk_idx = span_offset * 8 + aid->target_subblk; + + if (subblk_idx >= blk_len) + return enc->inverse; + + return !!(ptr[subblk_idx] & BIT(aid->target_subblk_bit)) ^ enc->inverse; +} + +/* + * An S1G PVB has 3 non optional encoding types, each that can be inverted. + * An S1G PVB is constructed with zero or more encoded block subfields. Each + * encoded block represents a single "block" of AIDs (64), and each encoded + * block can contain one of the 3 encoding types alongside a single bit for + * whether the bits should be inverted. + * + * As the standard makes no guarantee about the ordering of encoded blocks, + * we must parse every encoded block in the worst case scenario given an + * AID that lies within the last block. + */ +static inline bool ieee80211_s1g_check_tim(const struct ieee80211_tim_ie *tim, + u8 tim_len, u16 aid) +{ + int err; + struct s1g_tim_aid target_aid; + struct s1g_tim_enc_block enc_blk; + + if (tim_len < 3) + return false; + + target_aid.aid = aid; + target_aid.target_blk = (aid >> 6) & 0x1f; + target_aid.target_subblk = (aid >> 3) & 0x7; + target_aid.target_subblk_bit = aid & 0x7; + + /* + * Find our AIDs target encoded block and fill &enc_blk with the + * encoded blocks information. If no entry is found or an error + * occurs return false. + */ + err = ieee80211_s1g_find_target_block(&enc_blk, &target_aid, + tim->virtual_map, + (const u8 *)tim + tim_len + 2); + if (err) + return false; + + switch (enc_blk.enc_mode) { + case IEEE80211_S1G_TIM_ENC_MODE_BLOCK: + return ieee80211_s1g_parse_bitmap(&enc_blk, &target_aid); + case IEEE80211_S1G_TIM_ENC_MODE_SINGLE: + return ieee80211_s1g_parse_single(&enc_blk, &target_aid); + case IEEE80211_S1G_TIM_ENC_MODE_OLB: + return ieee80211_s1g_parse_olb(&enc_blk, &target_aid); + default: + return false; + } +} + +/** + * ieee80211_check_tim - check if AID bit is set in TIM + * @tim: the TIM IE + * @tim_len: length of the TIM IE + * @aid: the AID to look for + * @s1g: whether the TIM is from an S1G PPDU + * Return: whether or not traffic is indicated in the TIM for the given AID + */ +static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, + u8 tim_len, u16 aid, bool s1g) +{ + return s1g ? 
ieee80211_s1g_check_tim(tim, tim_len, aid) : + __ieee80211_check_tim(tim, tim_len, aid); +} + /** * ieee80211_get_tdls_action - get TDLS action code * @skb: the skb containing the frame, length will not be checked diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c index 20e022a03933..ebab1f0a0138 100644 --- a/net/mac80211/mesh_ps.c +++ b/net/mac80211/mesh_ps.c @@ -586,7 +586,7 @@ void ieee80211_mps_frame_release(struct sta_info *sta, if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) has_buffered = ieee80211_check_tim(elems->tim, elems->tim_len, - sta->mesh->aid); + sta->mesh->aid, false); if (has_buffered) mps_dbg(sta->sdata, "%pM indicates buffered frames\n", diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index f5d09ded9827..9568cc95a7ff 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -7438,7 +7438,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, ncrc = elems->crc; if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && - ieee80211_check_tim(elems->tim, elems->tim_len, vif_cfg->aid)) { + ieee80211_check_tim(elems->tim, elems->tim_len, vif_cfg->aid, + vif_cfg->s1g)) { if (local->hw.conf.dynamic_ps_timeout > 0) { if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; -- cgit v1.2.3 From d8b269e009bbc471cb2735b5f737839495efce3b Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Thu, 4 Sep 2025 15:45:05 +0800 Subject: cgroup: Remove unused cgroup_subsys::post_attach The cgroup_subsys::post_attach callback was introduced in commit 5cf1cacb49ae ("cgroup, cpuset: replace cpuset_post_attach_flush() with cgroup_subsys->post_attach callback"), and only cpuset used this callback, to wait for the mm migration to complete at the end of __cgroup_procs_write(). Since the previous patch defers the flush operation until returning to userspace, nothing uses this callback now. Remove this callback from cgroup_subsys. Signed-off-by: Chuyi Zhou Acked-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 1 - kernel/cgroup/cgroup.c | 4 ---- 2 files changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 539c64eeef38..92ed6d18266d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -763,7 +763,6 @@ struct cgroup_subsys { int (*can_attach)(struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup_taskset *tset); void (*attach)(struct cgroup_taskset *tset); - void (*post_attach)(void); int (*can_fork)(struct task_struct *task, struct css_set *cset); void (*cancel_fork)(struct task_struct *task, struct css_set *cset); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b38d7a847ed4..0dc6ad71f175 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3033,10 +3033,6 @@ void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked put_task_struct(task); cgroup_attach_unlock(threadgroup_locked); - - for_each_subsys(ss, ssid) - if (ss->post_attach) - ss->post_attach(); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) -- cgit v1.2.3 From b28f9eba12a4967eff6e8a1c0512f86f1ac7fa68 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 28 Jun 2025 11:37:30 -0400 Subject: change the calling conventions for vfs_parse_fs_string() The absolute majority of callers pass a 4th argument equal to strlen() of the 3rd one. Drop the v_size argument and add vfs_parse_fs_qstr() for the cases that want an independent length.
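A before/after sketch of the conversion (both patterns are taken from the hunks below; fc, name, content and size are whatever the calling context provides):

  /* before: every caller passed an explicit length */
  ret = vfs_parse_fs_string(fc, "source", name, strlen(name));

  /* after: the common case derives the length internally */
  ret = vfs_parse_fs_string(fc, "source", name);

  /* after: an independent length goes through the qstr variant */
  ret = vfs_parse_fs_qstr(fc, "source", &QSTR_LEN(content, size - 1));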
Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- Documentation/filesystems/mount_api.rst | 10 +++++++++- Documentation/filesystems/porting.rst | 12 ++++++++++++ drivers/gpu/drm/i915/gem/i915_gemfs.c | 9 ++------- drivers/gpu/drm/v3d/v3d_gemfs.c | 9 ++------- fs/afs/mntpt.c | 3 ++- fs/fs_context.c | 17 +++++++---------- fs/namespace.c | 8 +++----- fs/nfs/fs_context.c | 3 +-- fs/nfs/namespace.c | 3 ++- fs/smb/client/fs_context.c | 4 +--- include/linux/fs_context.h | 9 +++++++-- kernel/trace/trace.c | 3 +-- 12 files changed, 49 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/mount_api.rst b/Documentation/filesystems/mount_api.rst index e149b89118c8..c99ab1f7fea4 100644 --- a/Documentation/filesystems/mount_api.rst +++ b/Documentation/filesystems/mount_api.rst @@ -504,10 +504,18 @@ returned. clear the pointer, but then becomes responsible for disposing of the object. + * :: + + int vfs_parse_fs_qstr(struct fs_context *fc, const char *key, + const struct qstr *value); + + A wrapper around vfs_parse_fs_param() that copies the value string it is + passed. + * :: int vfs_parse_fs_string(struct fs_context *fc, const char *key, - const char *value, size_t v_size); + const char *value); A wrapper around vfs_parse_fs_param() that copies the value string it is passed. diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 85f590254f07..ab48ab3f6eb2 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1285,3 +1285,15 @@ rather than a VMA, as the VMA at this stage is not yet valid. The vm_area_desc provides the minimum required information for a filesystem to initialise state upon memory mapping of a file-backed region, and output parameters for the file system to set this state. + +--- + +**mandatory** + +Calling conventions for vfs_parse_fs_string() have changed; it does *not* +take length anymore (value ? strlen(value) : 0 is used). If you want +a different length, use + + vfs_parse_fs_qstr(fc, key, &QSTR_LEN(value, len)) + +instead. 
diff --git a/drivers/gpu/drm/i915/gem/i915_gemfs.c b/drivers/gpu/drm/i915/gem/i915_gemfs.c index a09e2eb47175..8f13ec4ff0d0 100644 --- a/drivers/gpu/drm/i915/gem/i915_gemfs.c +++ b/drivers/gpu/drm/i915/gem/i915_gemfs.c @@ -11,11 +11,6 @@ #include "i915_gemfs.h" #include "i915_utils.h" -static int add_param(struct fs_context *fc, const char *key, const char *val) -{ - return vfs_parse_fs_string(fc, key, val, strlen(val)); -} - void i915_gemfs_init(struct drm_i915_private *i915) { struct file_system_type *type; @@ -48,9 +43,9 @@ void i915_gemfs_init(struct drm_i915_private *i915) fc = fs_context_for_mount(type, SB_KERNMOUNT); if (IS_ERR(fc)) goto err; - ret = add_param(fc, "source", "tmpfs"); + ret = vfs_parse_fs_string(fc, "source", "tmpfs"); if (!ret) - ret = add_param(fc, "huge", "within_size"); + ret = vfs_parse_fs_string(fc, "huge", "within_size"); if (!ret) gemfs = fc_mount_longterm(fc); put_fs_context(fc); diff --git a/drivers/gpu/drm/v3d/v3d_gemfs.c b/drivers/gpu/drm/v3d/v3d_gemfs.c index 8ec6ed82b3d9..c1a30166c099 100644 --- a/drivers/gpu/drm/v3d/v3d_gemfs.c +++ b/drivers/gpu/drm/v3d/v3d_gemfs.c @@ -7,11 +7,6 @@ #include "v3d_drv.h" -static int add_param(struct fs_context *fc, const char *key, const char *val) -{ - return vfs_parse_fs_string(fc, key, val, strlen(val)); -} - void v3d_gemfs_init(struct v3d_dev *v3d) { struct file_system_type *type; @@ -38,9 +33,9 @@ void v3d_gemfs_init(struct v3d_dev *v3d) fc = fs_context_for_mount(type, SB_KERNMOUNT); if (IS_ERR(fc)) goto err; - ret = add_param(fc, "source", "tmpfs"); + ret = vfs_parse_fs_string(fc, "source", "tmpfs"); if (!ret) - ret = add_param(fc, "huge", "within_size"); + ret = vfs_parse_fs_string(fc, "huge", "within_size"); if (!ret) gemfs = fc_mount_longterm(fc); put_fs_context(fc); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 9434a5399f2b..1ad048e6e164 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -137,7 +137,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) ret = -EINVAL; if (content[size - 1] == '.') - ret = vfs_parse_fs_string(fc, "source", content, size - 1); + ret = vfs_parse_fs_qstr(fc, "source", + &QSTR_LEN(content, size - 1)); do_delayed_call(&cleanup); if (ret < 0) return ret; diff --git a/fs/fs_context.c b/fs/fs_context.c index 666e61753aed..93b7ebf8d927 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -161,25 +161,24 @@ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) EXPORT_SYMBOL(vfs_parse_fs_param); /** - * vfs_parse_fs_string - Convenience function to just parse a string. + * vfs_parse_fs_qstr - Convenience function to just parse a string. * @fc: Filesystem context. * @key: Parameter name. * @value: Default value. - * @v_size: Maximum number of bytes in the value. */ -int vfs_parse_fs_string(struct fs_context *fc, const char *key, - const char *value, size_t v_size) +int vfs_parse_fs_qstr(struct fs_context *fc, const char *key, + const struct qstr *value) { int ret; struct fs_parameter param = { .key = key, .type = fs_value_is_flag, - .size = v_size, + .size = value ? 
value->len : 0, }; if (value) { - param.string = kmemdup_nul(value, v_size, GFP_KERNEL); + param.string = kmemdup_nul(value->name, value->len, GFP_KERNEL); if (!param.string) return -ENOMEM; param.type = fs_value_is_string; @@ -189,7 +188,7 @@ int vfs_parse_fs_string(struct fs_context *fc, const char *key, kfree(param.string); return ret; } -EXPORT_SYMBOL(vfs_parse_fs_string); +EXPORT_SYMBOL(vfs_parse_fs_qstr); /** * vfs_parse_monolithic_sep - Parse key[=val][,key[=val]]* mount data @@ -218,16 +217,14 @@ int vfs_parse_monolithic_sep(struct fs_context *fc, void *data, while ((key = sep(&options)) != NULL) { if (*key) { - size_t v_len = 0; char *value = strchr(key, '='); if (value) { if (unlikely(value == key)) continue; *value++ = 0; - v_len = strlen(value); } - ret = vfs_parse_fs_string(fc, key, value, v_len); + ret = vfs_parse_fs_string(fc, key, value); if (ret < 0) break; } diff --git a/fs/namespace.c b/fs/namespace.c index ddfd4457d338..88636124c8fe 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1281,8 +1281,7 @@ struct vfsmount *vfs_kern_mount(struct file_system_type *type, return ERR_CAST(fc); if (name) - ret = vfs_parse_fs_string(fc, "source", - name, strlen(name)); + ret = vfs_parse_fs_string(fc, "source", name); if (!ret) ret = parse_monolithic_mount_data(fc, data); if (!ret) @@ -3793,10 +3792,9 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, fc->oldapi = true; if (subtype) - err = vfs_parse_fs_string(fc, "subtype", - subtype, strlen(subtype)); + err = vfs_parse_fs_string(fc, "subtype", subtype); if (!err && name) - err = vfs_parse_fs_string(fc, "source", name, strlen(name)); + err = vfs_parse_fs_string(fc, "source", name); if (!err) err = parse_monolithic_mount_data(fc, data); if (!err && !mount_capable(fc)) diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 9e94d18448ff..b4679b7161b0 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -1269,8 +1269,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc, int ret; data->context[NFS_MAX_CONTEXT_LEN] = '\0'; - ret = vfs_parse_fs_string(fc, "context", - data->context, strlen(data->context)); + ret = vfs_parse_fs_string(fc, "context", data->context); if (ret < 0) return ret; #else diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 7f1ec9c67ff2..5735c0448b4c 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -290,7 +290,8 @@ int nfs_do_submount(struct fs_context *fc) nfs_errorf(fc, "NFS: Couldn't determine submount pathname"); ret = PTR_ERR(p); } else { - ret = vfs_parse_fs_string(fc, "source", p, buffer + 4096 - p); + ret = vfs_parse_fs_qstr(fc, "source", + &QSTR_LEN(p, buffer + 4096 - p)); if (!ret) ret = vfs_get_tree(fc); } diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 072383899e81..306b76e97783 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -773,16 +773,14 @@ static int smb3_fs_context_parse_monolithic(struct fs_context *fc, } - len = 0; value = strchr(key, '='); if (value) { if (value == key) continue; *value++ = 0; - len = strlen(value); } - ret = vfs_parse_fs_string(fc, key, value, len); + ret = vfs_parse_fs_string(fc, key, value); if (ret < 0) break; } diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 7773eb870039..97b514a79a49 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -134,8 +134,13 @@ extern struct fs_context *fs_context_for_submount(struct file_system_type *fs_ty extern struct fs_context *vfs_dup_fs_context(struct fs_context *fc); 
extern int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param); -extern int vfs_parse_fs_string(struct fs_context *fc, const char *key, - const char *value, size_t v_size); +extern int vfs_parse_fs_qstr(struct fs_context *fc, const char *key, + const struct qstr *value); +static inline int vfs_parse_fs_string(struct fs_context *fc, const char *key, + const char *value) +{ + return vfs_parse_fs_qstr(fc, key, value ? &QSTR(value) : NULL); +} int vfs_parse_monolithic_sep(struct fs_context *fc, void *data, char *(*sep)(char **)); extern int generic_parse_monolithic(struct fs_context *fc, void *data); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4283ed4e8f59..15375b45fd74 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10201,8 +10201,7 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n"); - ret = vfs_parse_fs_string(fc, "source", - "tracefs", strlen("tracefs")); + ret = vfs_parse_fs_string(fc, "source", "tracefs"); if (!ret) mnt = fc_mount(fc); else -- cgit v1.2.3 From 038c7dc66e2744e5df57163b8f957745ae10d23e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Sep 2025 20:46:40 -0700 Subject: compiler_types.h: Move __nocfi out of compiler-specific header Prepare for GCC KCFI support and move the __nocfi attribute from compiler-clang.h to compiler_types.h. This was already gated by CONFIG_CFI_CLANG, so this remains safe for non-KCFI GCC builds. Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20250904034656.3670313-1-kees@kernel.org --- include/linux/compiler-clang.h | 5 ----- include/linux/compiler_types.h | 4 +++- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index fa4ffe037bc7..7a4568e421dc 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -96,11 +96,6 @@ # define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) #endif -#if __has_feature(kcfi) -/* Disable CFI checking inside a function. */ -#define __nocfi __attribute__((__no_sanitize__("kcfi"))) -#endif - /* * Turn individual warnings and errors on and off locally, depending * on version. diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 16755431fc11..a910f9fa5341 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -432,7 +432,9 @@ struct ftrace_likely_data { # define __noscs #endif -#ifndef __nocfi +#if defined(CONFIG_CFI_CLANG) +# define __nocfi __attribute__((__no_sanitize__("kcfi"))) +#else # define __nocfi #endif -- cgit v1.2.3 From 0b815825b1b0bd6762ca028e9b6631b002efb7ca Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Sep 2025 20:46:45 -0700 Subject: x86/cfi: Remove __noinitretpoline and __noretpoline Commit 66f793099a63 ("x86/retpoline: Avoid retpolines for built-in __init functions") disabled retpolines in __init sections (__noinitretpoline) as a precaution against potential issues with retpolines in early boot, but it has not been a problem in practice (i.e. see Clang below). Commit 87358710c1fb ("x86/retpoline: Support retpoline builds with Clang") narrowed this to only GCC, as Clang doesn't have per-function control over retpoline emission. As such, Clang has been booting with retpolines in __init since retpoline support was introduced. 
Clang KCFI has been instrumenting __init since CFI was introduced. With the introduction of KCFI for GCC, KCFI instrumentation with retpolines disabled means that objtool does not construct .retpoline_sites section entries for the non-retpoline KCFI calls. At boot, the KCFI rehashing code, via __apply_fineibt(), misses all __init KCFI calls (since they are not retpolines), resulting in immediate hash mismatches: all preambles are rehashed (via .cfi_sites) and none of the __init call sites are rehashed. Remove __noinitretpoline since it provides no meaningful utility and creates problems with CFI. Additionally remove __noretpoline since it is now unused. Alternatively, cfi_rand_callers() could walk the .kcfi_traps section which is exactly the list of KCFI instrumentation sites. But it seems better to have as few differences in common instruction sequences between compilers as possible, so better to remove the special handling of retpolines in __init for GCC. Signed-off-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250904034656.3670313-6-kees@kernel.org --- include/linux/compiler-gcc.h | 4 ---- include/linux/init.h | 8 -------- 2 files changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 5d07c469b571..5de824a0b3d7 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -35,10 +35,6 @@ (typeof(ptr)) (__ptr + (off)); \ }) -#ifdef CONFIG_MITIGATION_RETPOLINE -#define __noretpoline __attribute__((__indirect_branch__("keep"))) -#endif - #if defined(LATENT_ENTROPY_PLUGIN) && !defined(__CHECKER__) #define __latent_entropy __attribute__((latent_entropy)) #endif diff --git a/include/linux/init.h b/include/linux/init.h index a60d32d227ee..17c1bc712e23 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -7,13 +7,6 @@ #include #include -/* Built-in __init functions needn't be compiled with retpoline */ -#if defined(__noretpoline) && !defined(MODULE) -#define __noinitretpoline __noretpoline -#else -#define __noinitretpoline -#endif - /* These macros are used to mark some functions or * initialized data (doesn't apply to uninitialized data) * as `initialization' functions. The kernel can take this @@ -50,7 +43,6 @@ /* These are for everybody (although not all archs will actually discard it in modules) */ #define __init __section(".init.text") __cold __latent_entropy \ - __noinitretpoline \ __no_kstack_erase #define __initdata __section(".init.data") #define __initconst __section(".init.rodata") -- cgit v1.2.3 From 54728bd535fb3899ad51489dc1e05eb5bb53cb95 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 1 Sep 2025 15:27:43 +0200 Subject: bpf: Return an error pointer for skb metadata when CONFIG_NET=n Kernel Test Robot reported a compiler warning - a null pointer may be passed to memmove in __bpf_dynptr_{read,write} when building without networking support. The warning is correct from a static analysis standpoint, but the path is not actually reachable. Without CONFIG_NET, creating dynptrs to skb metadata is impossible since the constructor kfunc is missing. Silence the false-positive diagnostic message by returning an error pointer from the bpf_skb_meta_pointer() stub when CONFIG_NET=n.
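An illustrative caller-side sketch (not taken from this patch): with the stub returning an error pointer rather than NULL, a consumer built with CONFIG_NET=n can reject the pointer with the usual IS_ERR() idiom before it ever reaches memmove():

  void *ptr = bpf_skb_meta_pointer(skb, offset);

  if (IS_ERR(ptr))
          return PTR_ERR(ptr); /* -EOPNOTSUPP when CONFIG_NET=n */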
Fixes: 6877cd392bae ("bpf: Enable read/write access to skb metadata through a dynptr") Closes: https://lore.kernel.org/oe-kbuild-all/202508212031.ir9b3B6Q-lkp@intel.com/ Reported-by: kernel test robot Suggested-by: Alexei Starovoitov Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250901-dynptr-skb-meta-no-net-v2-1-ce607fcb6091@cloudflare.com --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 9ed21b65e2e9..af6d9354662c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1822,7 +1822,7 @@ static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, voi static inline void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) { - return NULL; + return ERR_PTR(-EOPNOTSUPP); } #endif /* CONFIG_NET */ -- cgit v1.2.3 From 74a0e72f03ffd01b5d88b411f02d9b9861fdb99e Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Thu, 21 Aug 2025 12:15:59 +0200 Subject: iommu/io-pgtable-dart: Add 4-level page table support DARTs on t602x SoCs are of the t8110 variant but have an IAS of 42, which means optional support for an extra page table level. Refactor the PTE management to support an arbitrary level count, and then calculate how many levels we need for any given configuration. Signed-off-by: Hector Martin Signed-off-by: Janne Grunau Reviewed-by: Sven Peter Reviewed-by: Neal Gompa Link: https://lore.kernel.org/r/20250821-apple-dart-4levels-v2-2-e39af79daa37@jannau.net Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgtable-dart.c | 139 +++++++++++++++++++++++++--------------- include/linux/io-pgtable.h | 1 + 2 files changed, 87 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/io-pgtable-dart.c b/drivers/iommu/io-pgtable-dart.c index 679bda104797..9a63c80a2786 100644 --- a/drivers/iommu/io-pgtable-dart.c +++ b/drivers/iommu/io-pgtable-dart.c @@ -27,8 +27,9 @@ #define DART1_MAX_ADDR_BITS 36 -#define DART_MAX_TABLES 4 -#define DART_LEVELS 2 +#define DART_MAX_TABLE_BITS 2 +#define DART_MAX_TABLES BIT(DART_MAX_TABLE_BITS) +#define DART_MAX_LEVELS 4 /* Includes TTBR level */ /* Struct accessors */ #define io_pgtable_to_data(x) \ @@ -68,6 +69,7 @@ struct dart_io_pgtable { struct io_pgtable iop; + int levels; int tbl_bits; int bits_per_level; @@ -156,44 +158,45 @@ static dart_iopte dart_install_table(dart_iopte *table, return old; } -static int dart_get_table(struct dart_io_pgtable *data, unsigned long iova) +static int dart_get_index(struct dart_io_pgtable *data, unsigned long iova, int level) { - return (iova >> (3 * data->bits_per_level + ilog2(sizeof(dart_iopte)))) & - ((1 << data->tbl_bits) - 1); + return (iova >> (level * data->bits_per_level + ilog2(sizeof(dart_iopte)))) & + ((1 << data->bits_per_level) - 1); } -static int dart_get_l1_index(struct dart_io_pgtable *data, unsigned long iova) -{ - - return (iova >> (2 * data->bits_per_level + ilog2(sizeof(dart_iopte)))) & - ((1 << data->bits_per_level) - 1); -} - -static int dart_get_l2_index(struct dart_io_pgtable *data, unsigned long iova) +static int dart_get_last_index(struct dart_io_pgtable *data, unsigned long iova) { return (iova >> (data->bits_per_level + ilog2(sizeof(dart_iopte)))) & ((1 << data->bits_per_level) - 1); } -static dart_iopte *dart_get_l2(struct dart_io_pgtable *data, unsigned long iova) +static dart_iopte *dart_get_last(struct dart_io_pgtable *data, unsigned long iova) { dart_iopte pte, *ptep; - int 
tbl = dart_get_table(data, iova); + int level = data->levels; + int tbl = dart_get_index(data, iova, level); + + if (tbl > (1 << data->tbl_bits)) + return NULL; ptep = data->pgd[tbl]; if (!ptep) return NULL; - ptep += dart_get_l1_index(data, iova); - pte = READ_ONCE(*ptep); + while (--level > 1) { + ptep += dart_get_index(data, iova, level); + pte = READ_ONCE(*ptep); - /* Valid entry? */ - if (!pte) - return NULL; + /* Valid entry? */ + if (!pte) + return NULL; - /* Deref to get level 2 table */ - return iopte_deref(pte, data); + /* Deref to get next level table */ + ptep = iopte_deref(pte, data); + } + + return ptep; } static dart_iopte dart_prot_to_pte(struct dart_io_pgtable *data, @@ -230,6 +233,7 @@ static int dart_map_pages(struct io_pgtable_ops *ops, unsigned long iova, int ret = 0, tbl, num_entries, max_entries, map_idx_start; dart_iopte pte, *cptep, *ptep; dart_iopte prot; + int level = data->levels; if (WARN_ON(pgsize != cfg->pgsize_bitmap)) return -EINVAL; @@ -240,31 +244,36 @@ static int dart_map_pages(struct io_pgtable_ops *ops, unsigned long iova, if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) return -EINVAL; - tbl = dart_get_table(data, iova); + tbl = dart_get_index(data, iova, level); + + if (tbl > (1 << data->tbl_bits)) + return -ENOMEM; ptep = data->pgd[tbl]; - ptep += dart_get_l1_index(data, iova); - pte = READ_ONCE(*ptep); + while (--level > 1) { + ptep += dart_get_index(data, iova, level); + pte = READ_ONCE(*ptep); - /* no L2 table present */ - if (!pte) { - cptep = iommu_alloc_pages_sz(gfp, tblsz); - if (!cptep) - return -ENOMEM; + /* no table present */ + if (!pte) { + cptep = iommu_alloc_pages_sz(gfp, tblsz); + if (!cptep) + return -ENOMEM; - pte = dart_install_table(cptep, ptep, 0, data); - if (pte) - iommu_free_pages(cptep); + pte = dart_install_table(cptep, ptep, 0, data); + if (pte) + iommu_free_pages(cptep); - /* L2 table is present (now) */ - pte = READ_ONCE(*ptep); - } + /* L2 table is present (now) */ + pte = READ_ONCE(*ptep); + } - ptep = iopte_deref(pte, data); + ptep = iopte_deref(pte, data); + } /* install a leaf entries into L2 table */ prot = dart_prot_to_pte(data, iommu_prot); - map_idx_start = dart_get_l2_index(data, iova); + map_idx_start = dart_get_last_index(data, iova); max_entries = DART_PTES_PER_TABLE(data) - map_idx_start; num_entries = min_t(int, pgcount, max_entries); ptep += map_idx_start; @@ -293,13 +302,13 @@ static size_t dart_unmap_pages(struct io_pgtable_ops *ops, unsigned long iova, if (WARN_ON(pgsize != cfg->pgsize_bitmap || !pgcount)) return 0; - ptep = dart_get_l2(data, iova); + ptep = dart_get_last(data, iova); /* Valid L2 IOPTE pointer? */ if (WARN_ON(!ptep)) return 0; - unmap_idx_start = dart_get_l2_index(data, iova); + unmap_idx_start = dart_get_last_index(data, iova); ptep += unmap_idx_start; max_entries = DART_PTES_PER_TABLE(data) - unmap_idx_start; @@ -330,13 +339,13 @@ static phys_addr_t dart_iova_to_phys(struct io_pgtable_ops *ops, struct dart_io_pgtable *data = io_pgtable_ops_to_data(ops); dart_iopte pte, *ptep; - ptep = dart_get_l2(data, iova); + ptep = dart_get_last(data, iova); /* Valid L2 IOPTE pointer? 
*/ if (!ptep) return 0; - ptep += dart_get_l2_index(data, iova); + ptep += dart_get_last_index(data, iova); pte = READ_ONCE(*ptep); /* Found translation */ @@ -353,21 +362,37 @@ static struct dart_io_pgtable * dart_alloc_pgtable(struct io_pgtable_cfg *cfg) { struct dart_io_pgtable *data; - int tbl_bits, bits_per_level, va_bits, pg_shift; + int levels, max_tbl_bits, tbl_bits, bits_per_level, va_bits, pg_shift; + + /* + * Old 4K page DARTs can use up to 4 top-level tables. + * Newer ones only ever use a maximum of 1. + */ + if (cfg->pgsize_bitmap == SZ_4K) + max_tbl_bits = DART_MAX_TABLE_BITS; + else + max_tbl_bits = 0; pg_shift = __ffs(cfg->pgsize_bitmap); bits_per_level = pg_shift - ilog2(sizeof(dart_iopte)); va_bits = cfg->ias - pg_shift; - tbl_bits = max_t(int, 0, va_bits - (bits_per_level * DART_LEVELS)); - if ((1 << tbl_bits) > DART_MAX_TABLES) + levels = max_t(int, 2, (va_bits - max_tbl_bits + bits_per_level - 1) / bits_per_level); + + if (levels > (DART_MAX_LEVELS - 1)) + return NULL; + + tbl_bits = max_t(int, 0, va_bits - (bits_per_level * levels)); + + if (tbl_bits > max_tbl_bits) return NULL; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return NULL; + data->levels = levels + 1; /* Table level counts as one level */ data->tbl_bits = tbl_bits; data->bits_per_level = bits_per_level; @@ -403,6 +428,7 @@ apple_dart_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) return NULL; cfg->apple_dart_cfg.n_ttbrs = 1 << data->tbl_bits; + cfg->apple_dart_cfg.n_levels = data->levels; for (i = 0; i < cfg->apple_dart_cfg.n_ttbrs; ++i) { data->pgd[i] = @@ -422,24 +448,31 @@ out_free_data: return NULL; } -static void apple_dart_free_pgtable(struct io_pgtable *iop) +static void apple_dart_free_pgtables(struct dart_io_pgtable *data, dart_iopte *ptep, int level) { - struct dart_io_pgtable *data = io_pgtable_to_data(iop); - dart_iopte *ptep, *end; - int i; + dart_iopte *end; + dart_iopte *start = ptep; - for (i = 0; i < (1 << data->tbl_bits) && data->pgd[i]; ++i) { - ptep = data->pgd[i]; + if (level > 1) { end = (void *)ptep + DART_GRANULE(data); while (ptep != end) { dart_iopte pte = *ptep++; if (pte) - iommu_free_pages(iopte_deref(pte, data)); + apple_dart_free_pgtables(data, iopte_deref(pte, data), level - 1); } - iommu_free_pages(data->pgd[i]); } + iommu_free_pages(start); +} + +static void apple_dart_free_pgtable(struct io_pgtable *iop) +{ + struct dart_io_pgtable *data = io_pgtable_to_data(iop); + int i; + + for (i = 0; i < (1 << data->tbl_bits) && data->pgd[i]; ++i) + apple_dart_free_pgtables(data, data->pgd[i], data->levels - 1); kfree(data); } diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 138fbd89b1e6..8a823c6f2b4a 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -180,6 +180,7 @@ struct io_pgtable_cfg { struct { u64 ttbr[4]; u32 n_ttbrs; + u32 n_levels; } apple_dart_cfg; struct { -- cgit v1.2.3 From 8f77295525825086cb43675cd1a4f3716b119d7f Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Mon, 18 Aug 2025 10:28:05 +0530 Subject: ACPI: RISC-V: Add support for RIMT RISC-V IO Mapping Table (RIMT) is a static ACPI table to communicate IOMMU information to the OS. The spec is available at [1]. The changes at high level are, a) Initialize data structures required for IOMMU/device configuration using the data from RIMT. Provide APIs required for device configuration. b) Provide an API for IOMMU drivers to register the fwnode with RIMT data structures. This API will create a fwnode for PCIe IOMMU. 
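As a rough illustration (not part of the patch) of how a RISC-V IOMMU driver might hook into this API, consider the sketch below; the driver probe function is hypothetical, and only rimt_iommu_register() comes from this series, with rimt_iommu_configure_id() being the device-configuration side of the API:

#include <linux/platform_device.h>
#include <linux/acpi_rimt.h>

/* Hedged sketch: a hypothetical RISC-V IOMMU driver probe path. */
static int example_iommu_probe(struct platform_device *pdev)
{
	int ret;

	/*
	 * Publish this IOMMU's fwnode so later RIMT lookups
	 * (e.g. rimt_iommu_configure_id() when devices behind it
	 * are configured) can resolve to it. Returns -ENODEV when
	 * no matching RIMT IOMMU node exists.
	 */
	ret = rimt_iommu_register(&pdev->dev);
	if (ret)
		return ret;

	/* ... the rest of the normal IOMMU probe would follow ... */
	return 0;
}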
[1] - https://github.com/riscv-non-isa/riscv-acpi-rimt Signed-off-by: Sunil V L Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20250818045807.763922-2-sunilvl@ventanamicro.com Signed-off-by: Joerg Roedel --- MAINTAINERS | 1 + arch/riscv/Kconfig | 1 + drivers/acpi/Kconfig | 4 + drivers/acpi/riscv/Kconfig | 7 + drivers/acpi/riscv/Makefile | 1 + drivers/acpi/riscv/init.c | 2 + drivers/acpi/riscv/init.h | 1 + drivers/acpi/riscv/rimt.c | 520 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi_rimt.h | 28 +++ 9 files changed, 565 insertions(+) create mode 100644 drivers/acpi/riscv/Kconfig create mode 100644 drivers/acpi/riscv/rimt.c create mode 100644 include/linux/acpi_rimt.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa4..af562bb23bef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -347,6 +347,7 @@ L: linux-acpi@vger.kernel.org L: linux-riscv@lists.infradead.org S: Maintained F: drivers/acpi/riscv/ +F: include/linux/acpi_rimt.h ACPI PCC(Platform Communication Channel) MAILBOX DRIVER M: Sudeep Holla diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a4b233a0659e..7563f43ec026 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -16,6 +16,7 @@ config RISCV select ACPI_MCFG if (ACPI && PCI) select ACPI_PPTT if ACPI select ACPI_REDUCED_HARDWARE_ONLY if ACPI + select ACPI_RIMT if ACPI select ACPI_SPCR_TABLE if ACPI select ARCH_DMA_DEFAULT_COHERENT select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index b594780a57d7..2cdbd08b30e4 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -547,6 +547,10 @@ if ARM64 source "drivers/acpi/arm64/Kconfig" endif +if RISCV +source "drivers/acpi/riscv/Kconfig" +endif + config ACPI_PPTT bool diff --git a/drivers/acpi/riscv/Kconfig b/drivers/acpi/riscv/Kconfig new file mode 100644 index 000000000000..046296a18d00 --- /dev/null +++ b/drivers/acpi/riscv/Kconfig @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# ACPI Configuration for RISC-V +# + +config ACPI_RIMT + bool diff --git a/drivers/acpi/riscv/Makefile b/drivers/acpi/riscv/Makefile index a96fdf1e2cb8..1284a076fa88 100644 --- a/drivers/acpi/riscv/Makefile +++ b/drivers/acpi/riscv/Makefile @@ -2,3 +2,4 @@ obj-y += rhct.o init.o irq.o obj-$(CONFIG_ACPI_PROCESSOR_IDLE) += cpuidle.o obj-$(CONFIG_ACPI_CPPC_LIB) += cppc.o +obj-$(CONFIG_ACPI_RIMT) += rimt.o diff --git a/drivers/acpi/riscv/init.c b/drivers/acpi/riscv/init.c index 673e4d5dd752..7c00f7995e86 100644 --- a/drivers/acpi/riscv/init.c +++ b/drivers/acpi/riscv/init.c @@ -10,4 +10,6 @@ void __init acpi_arch_init(void) { riscv_acpi_init_gsi_mapping(); + if (IS_ENABLED(CONFIG_ACPI_RIMT)) + riscv_acpi_rimt_init(); } diff --git a/drivers/acpi/riscv/init.h b/drivers/acpi/riscv/init.h index 0b9a07e4031f..1680aa2aaf23 100644 --- a/drivers/acpi/riscv/init.h +++ b/drivers/acpi/riscv/init.h @@ -2,3 +2,4 @@ #include void __init riscv_acpi_init_gsi_mapping(void); +void __init riscv_acpi_rimt_init(void); diff --git a/drivers/acpi/riscv/rimt.c b/drivers/acpi/riscv/rimt.c new file mode 100644 index 000000000000..683fcfe35c31 --- /dev/null +++ b/drivers/acpi/riscv/rimt.c @@ -0,0 +1,520 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2024-2025, Ventana Micro Systems Inc + * Author: Sunil V L + * + */ + +#define pr_fmt(fmt) "ACPI: RIMT: " fmt + +#include +#include +#include +#include +#include +#include +#include "init.h" + +struct rimt_fwnode { + struct list_head list; + struct 
acpi_rimt_node *rimt_node; + struct fwnode_handle *fwnode; +}; + +static LIST_HEAD(rimt_fwnode_list); +static DEFINE_SPINLOCK(rimt_fwnode_lock); + +#define RIMT_TYPE_MASK(type) (1 << (type)) +#define RIMT_IOMMU_TYPE BIT(0) + +/* Root pointer to the mapped RIMT table */ +static struct acpi_table_header *rimt_table; + +/** + * rimt_set_fwnode() - Create rimt_fwnode and use it to register + * iommu data in the rimt_fwnode_list + * + * @rimt_node: RIMT table node associated with the IOMMU + * @fwnode: fwnode associated with the RIMT node + * + * Returns: 0 on success + * <0 on failure + */ +static int rimt_set_fwnode(struct acpi_rimt_node *rimt_node, + struct fwnode_handle *fwnode) +{ + struct rimt_fwnode *np; + + np = kzalloc(sizeof(*np), GFP_ATOMIC); + + if (WARN_ON(!np)) + return -ENOMEM; + + INIT_LIST_HEAD(&np->list); + np->rimt_node = rimt_node; + np->fwnode = fwnode; + + spin_lock(&rimt_fwnode_lock); + list_add_tail(&np->list, &rimt_fwnode_list); + spin_unlock(&rimt_fwnode_lock); + + return 0; +} + +/** + * rimt_get_fwnode() - Retrieve fwnode associated with an RIMT node + * + * @node: RIMT table node to be looked-up + * + * Returns: fwnode_handle pointer on success, NULL on failure + */ +static struct fwnode_handle *rimt_get_fwnode(struct acpi_rimt_node *node) +{ + struct fwnode_handle *fwnode = NULL; + struct rimt_fwnode *curr; + + spin_lock(&rimt_fwnode_lock); + list_for_each_entry(curr, &rimt_fwnode_list, list) { + if (curr->rimt_node == node) { + fwnode = curr->fwnode; + break; + } + } + spin_unlock(&rimt_fwnode_lock); + + return fwnode; +} + +static acpi_status rimt_match_node_callback(struct acpi_rimt_node *node, + void *context) +{ + acpi_status status = AE_NOT_FOUND; + struct device *dev = context; + + if (node->type == ACPI_RIMT_NODE_TYPE_IOMMU) { + struct acpi_rimt_iommu *iommu_node = (struct acpi_rimt_iommu *)&node->node_data; + + if (dev_is_pci(dev)) { + struct pci_dev *pdev; + u16 bdf; + + pdev = to_pci_dev(dev); + bdf = PCI_DEVID(pdev->bus->number, pdev->devfn); + if ((pci_domain_nr(pdev->bus) == iommu_node->pcie_segment_number) && + bdf == iommu_node->pcie_bdf) { + status = AE_OK; + } else { + status = AE_NOT_FOUND; + } + } else { + struct platform_device *pdev = to_platform_device(dev); + struct resource *res; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (res && res->start == iommu_node->base_address) + status = AE_OK; + else + status = AE_NOT_FOUND; + } + } else if (node->type == ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX) { + struct acpi_rimt_pcie_rc *pci_rc; + struct pci_bus *bus; + + bus = to_pci_bus(dev); + pci_rc = (struct acpi_rimt_pcie_rc *)node->node_data; + + /* + * It is assumed that PCI segment numbers maps one-to-one + * with root complexes. Each segment number can represent only + * one root complex. + */ + status = pci_rc->pcie_segment_number == pci_domain_nr(bus) ? + AE_OK : AE_NOT_FOUND; + } else if (node->type == ACPI_RIMT_NODE_TYPE_PLAT_DEVICE) { + struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_rimt_platform_device *ncomp; + struct device *plat_dev = dev; + struct acpi_device *adev; + + /* + * Walk the device tree to find a device with an + * ACPI companion; there is no point in scanning + * RIMT for a device matching a platform device if + * the device does not have an ACPI companion to + * start with. 
+ */ + do { + adev = ACPI_COMPANION(plat_dev); + if (adev) + break; + + plat_dev = plat_dev->parent; + } while (plat_dev); + + if (!adev) + return status; + + status = acpi_get_name(adev->handle, ACPI_FULL_PATHNAME, &buf); + if (ACPI_FAILURE(status)) { + dev_warn(plat_dev, "Can't get device full path name\n"); + return status; + } + + ncomp = (struct acpi_rimt_platform_device *)node->node_data; + status = !strcmp(ncomp->device_name, buf.pointer) ? + AE_OK : AE_NOT_FOUND; + acpi_os_free(buf.pointer); + } + + return status; +} + +static struct acpi_rimt_node *rimt_scan_node(enum acpi_rimt_node_type type, + void *context) +{ + struct acpi_rimt_node *rimt_node, *rimt_end; + struct acpi_table_rimt *rimt; + int i; + + if (!rimt_table) + return NULL; + + /* Get the first RIMT node */ + rimt = (struct acpi_table_rimt *)rimt_table; + rimt_node = ACPI_ADD_PTR(struct acpi_rimt_node, rimt, + rimt->node_offset); + rimt_end = ACPI_ADD_PTR(struct acpi_rimt_node, rimt_table, + rimt_table->length); + + for (i = 0; i < rimt->num_nodes; i++) { + if (WARN_TAINT(rimt_node >= rimt_end, TAINT_FIRMWARE_WORKAROUND, + "RIMT node pointer overflows, bad table!\n")) + return NULL; + + if (rimt_node->type == type && + ACPI_SUCCESS(rimt_match_node_callback(rimt_node, context))) + return rimt_node; + + rimt_node = ACPI_ADD_PTR(struct acpi_rimt_node, rimt_node, + rimt_node->length); + } + + return NULL; +} + +static bool rimt_pcie_rc_supports_ats(struct acpi_rimt_node *node) +{ + struct acpi_rimt_pcie_rc *pci_rc; + + pci_rc = (struct acpi_rimt_pcie_rc *)node->node_data; + return pci_rc->flags & ACPI_RIMT_PCIE_ATS_SUPPORTED; +} + +static int rimt_iommu_xlate(struct device *dev, struct acpi_rimt_node *node, u32 deviceid) +{ + struct fwnode_handle *rimt_fwnode; + + if (!node) + return -ENODEV; + + rimt_fwnode = rimt_get_fwnode(node); + + /* + * The IOMMU drivers may not be probed yet. + * Defer the IOMMU configuration + */ + if (!rimt_fwnode) + return -EPROBE_DEFER; + + return acpi_iommu_fwspec_init(dev, deviceid, rimt_fwnode); +} + +struct rimt_pci_alias_info { + struct device *dev; + struct acpi_rimt_node *node; + const struct iommu_ops *ops; +}; + +static int rimt_id_map(struct acpi_rimt_id_mapping *map, u8 type, u32 rid_in, u32 *rid_out) +{ + if (rid_in < map->source_id_base || + (rid_in > map->source_id_base + map->num_ids)) + return -ENXIO; + + *rid_out = map->dest_id_base + (rid_in - map->source_id_base); + return 0; +} + +static struct acpi_rimt_node *rimt_node_get_id(struct acpi_rimt_node *node, + u32 *id_out, int index) +{ + struct acpi_rimt_platform_device *plat_node; + u32 id_mapping_offset, num_id_mapping; + struct acpi_rimt_pcie_rc *pci_node; + struct acpi_rimt_id_mapping *map; + struct acpi_rimt_node *parent; + + if (node->type == ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX) { + pci_node = (struct acpi_rimt_pcie_rc *)&node->node_data; + id_mapping_offset = pci_node->id_mapping_offset; + num_id_mapping = pci_node->num_id_mappings; + } else if (node->type == ACPI_RIMT_NODE_TYPE_PLAT_DEVICE) { + plat_node = (struct acpi_rimt_platform_device *)&node->node_data; + id_mapping_offset = plat_node->id_mapping_offset; + num_id_mapping = plat_node->num_id_mappings; + } else { + return NULL; + } + + if (!id_mapping_offset || !num_id_mapping || index >= num_id_mapping) + return NULL; + + map = ACPI_ADD_PTR(struct acpi_rimt_id_mapping, node, + id_mapping_offset + index * sizeof(*map)); + + /* Firmware bug! 
*/ + if (!map->dest_offset) { + pr_err(FW_BUG "[node %p type %d] ID map has NULL parent reference\n", + node, node->type); + return NULL; + } + + parent = ACPI_ADD_PTR(struct acpi_rimt_node, rimt_table, map->dest_offset); + + if (node->type == ACPI_RIMT_NODE_TYPE_PLAT_DEVICE || + node->type == ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX) { + *id_out = map->dest_id_base; + return parent; + } + + return NULL; +} + +/* + * RISC-V supports IOMMU as a PCI device or a platform device. + * When it is a platform device, there should be a namespace device as + * well along with RIMT. To create the link between RIMT information and + * the platform device, the IOMMU driver should register itself with the + * RIMT module. This is true for PCI based IOMMU as well. + */ +int rimt_iommu_register(struct device *dev) +{ + struct fwnode_handle *rimt_fwnode; + struct acpi_rimt_node *node; + + node = rimt_scan_node(ACPI_RIMT_NODE_TYPE_IOMMU, dev); + if (!node) { + pr_err("Could not find IOMMU node in RIMT\n"); + return -ENODEV; + } + + if (dev_is_pci(dev)) { + rimt_fwnode = acpi_alloc_fwnode_static(); + if (!rimt_fwnode) + return -ENOMEM; + + rimt_fwnode->dev = dev; + if (!dev->fwnode) + dev->fwnode = rimt_fwnode; + + rimt_set_fwnode(node, rimt_fwnode); + } else { + rimt_set_fwnode(node, dev->fwnode); + } + + return 0; +} + +#ifdef CONFIG_IOMMU_API + +static struct acpi_rimt_node *rimt_node_map_id(struct acpi_rimt_node *node, + u32 id_in, u32 *id_out, + u8 type_mask) +{ + struct acpi_rimt_platform_device *plat_node; + u32 id_mapping_offset, num_id_mapping; + struct acpi_rimt_pcie_rc *pci_node; + u32 id = id_in; + + /* Parse the ID mapping tree to find specified node type */ + while (node) { + struct acpi_rimt_id_mapping *map; + int i, rc = 0; + u32 map_id = id; + + if (RIMT_TYPE_MASK(node->type) & type_mask) { + if (id_out) + *id_out = id; + return node; + } + + if (node->type == ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX) { + pci_node = (struct acpi_rimt_pcie_rc *)&node->node_data; + id_mapping_offset = pci_node->id_mapping_offset; + num_id_mapping = pci_node->num_id_mappings; + } else if (node->type == ACPI_RIMT_NODE_TYPE_PLAT_DEVICE) { + plat_node = (struct acpi_rimt_platform_device *)&node->node_data; + id_mapping_offset = plat_node->id_mapping_offset; + num_id_mapping = plat_node->num_id_mappings; + } else { + goto fail_map; + } + + if (!id_mapping_offset || !num_id_mapping) + goto fail_map; + + map = ACPI_ADD_PTR(struct acpi_rimt_id_mapping, node, + id_mapping_offset); + + /* Firmware bug! */ + if (!map->dest_offset) { + pr_err(FW_BUG "[node %p type %d] ID map has NULL parent reference\n", + node, node->type); + goto fail_map; + } + + /* Do the ID translation */ + for (i = 0; i < num_id_mapping; i++, map++) { + rc = rimt_id_map(map, node->type, map_id, &id); + if (!rc) + break; + } + + if (i == num_id_mapping) + goto fail_map; + + node = ACPI_ADD_PTR(struct acpi_rimt_node, rimt_table, + rc ? 
0 : map->dest_offset); + } + +fail_map: + /* Map input ID to output ID unchanged on mapping failure */ + if (id_out) + *id_out = id_in; + + return NULL; +} + +static struct acpi_rimt_node *rimt_node_map_platform_id(struct acpi_rimt_node *node, u32 *id_out, + u8 type_mask, int index) +{ + struct acpi_rimt_node *parent; + u32 id; + + parent = rimt_node_get_id(node, &id, index); + if (!parent) + return NULL; + + if (!(RIMT_TYPE_MASK(parent->type) & type_mask)) + parent = rimt_node_map_id(parent, id, id_out, type_mask); + else + if (id_out) + *id_out = id; + + return parent; +} + +static int rimt_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data) +{ + struct rimt_pci_alias_info *info = data; + struct acpi_rimt_node *parent; + u32 deviceid; + + parent = rimt_node_map_id(info->node, alias, &deviceid, RIMT_IOMMU_TYPE); + return rimt_iommu_xlate(info->dev, parent, deviceid); +} + +static int rimt_plat_iommu_map(struct device *dev, struct acpi_rimt_node *node) +{ + struct acpi_rimt_node *parent; + int err = -ENODEV, i = 0; + u32 deviceid = 0; + + do { + parent = rimt_node_map_platform_id(node, &deviceid, + RIMT_IOMMU_TYPE, + i++); + + if (parent) + err = rimt_iommu_xlate(dev, parent, deviceid); + } while (parent && !err); + + return err; +} + +static int rimt_plat_iommu_map_id(struct device *dev, + struct acpi_rimt_node *node, + const u32 *in_id) +{ + struct acpi_rimt_node *parent; + u32 deviceid; + + parent = rimt_node_map_id(node, *in_id, &deviceid, RIMT_IOMMU_TYPE); + if (parent) + return rimt_iommu_xlate(dev, parent, deviceid); + + return -ENODEV; +} + +/** + * rimt_iommu_configure_id - Set-up IOMMU configuration for a device. + * + * @dev: device to configure + * @id_in: optional input id const value pointer + * + * Returns: 0 on success, <0 on failure + */ +int rimt_iommu_configure_id(struct device *dev, const u32 *id_in) +{ + struct acpi_rimt_node *node; + int err = -ENODEV; + + if (dev_is_pci(dev)) { + struct iommu_fwspec *fwspec; + struct pci_bus *bus = to_pci_dev(dev)->bus; + struct rimt_pci_alias_info info = { .dev = dev }; + + node = rimt_scan_node(ACPI_RIMT_NODE_TYPE_PCIE_ROOT_COMPLEX, &bus->dev); + if (!node) + return -ENODEV; + + info.node = node; + err = pci_for_each_dma_alias(to_pci_dev(dev), + rimt_pci_iommu_init, &info); + + fwspec = dev_iommu_fwspec_get(dev); + if (fwspec && rimt_pcie_rc_supports_ats(node)) + fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS; + } else { + node = rimt_scan_node(ACPI_RIMT_NODE_TYPE_PLAT_DEVICE, dev); + if (!node) + return -ENODEV; + + err = id_in ? rimt_plat_iommu_map_id(dev, node, id_in) : + rimt_plat_iommu_map(dev, node); + } + + return err; +} + +#endif + +void __init riscv_acpi_rimt_init(void) +{ + acpi_status status; + + /* rimt_table will be used at runtime after the rimt init, + * so we don't need to call acpi_put_table() to release + * the RIMT table mapping. + */ + status = acpi_get_table(ACPI_SIG_RIMT, 0, &rimt_table); + if (ACPI_FAILURE(status)) { + if (status != AE_NOT_FOUND) { + const char *msg = acpi_format_exception(status); + + pr_err("Failed to get table, %s\n", msg); + } + + return; + } +} diff --git a/include/linux/acpi_rimt.h b/include/linux/acpi_rimt.h new file mode 100644 index 000000000000..fad3adc4d899 --- /dev/null +++ b/include/linux/acpi_rimt.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024-2025, Ventana Micro Systems Inc. 
+ * Author: Sunil V L + */ + +#ifndef _ACPI_RIMT_H +#define _ACPI_RIMT_H + +#ifdef CONFIG_ACPI_RIMT +int rimt_iommu_register(struct device *dev); +#else +static inline int rimt_iommu_register(struct device *dev) +{ + return -ENODEV; +} +#endif + +#if defined(CONFIG_IOMMU_API) && defined(CONFIG_ACPI_RIMT) +int rimt_iommu_configure_id(struct device *dev, const u32 *id_in); +#else +static inline int rimt_iommu_configure_id(struct device *dev, const u32 *id_in) +{ + return -ENODEV; +} +#endif + +#endif /* _ACPI_RIMT_H */ -- cgit v1.2.3 From f6cfa602d2ba7e5ca9dc65ec4141521aca80bda2 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 11:13:23 +0200 Subject: workqueue: replace use of system_unbound_wq with system_dfl_wq Currently, if a user enqueues a work item using schedule_delayed_work(), the wq used is "system_wq" (per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and to queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. system_unbound_wq should be the default workqueue so as not to enforce locality constraints for random work whenever it's not required. Add system_dfl_wq to encourage its use when unbound work should be used. queue_work() / queue_delayed_work() / mod_delayed_work() will now use the new unbound wq: if a user still uses the old wq, a warning will be printed along with a redirect to the new one. The old system_unbound_wq will be kept for a few release cycles. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 4 ++-- kernel/workqueue.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 45d5dd470ff6..af860e8f8481 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -783,8 +783,8 @@ extern void __warn_flushing_systemwide_wq(void) _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ _wq == system_long_wq) || \ - (__builtin_constant_p(_wq == system_unbound_wq) && \ - _wq == system_unbound_wq) || \ + (__builtin_constant_p(_wq == system_dfl_wq) && \ + _wq == system_dfl_wq) || \ (__builtin_constant_p(_wq == system_freezable_wq) && \ _wq == system_freezable_wq) || \ (__builtin_constant_p(_wq == system_power_efficient_wq) && \ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 59faf857ee4f..2888f4399acd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2932,7 +2932,7 @@ static void idle_worker_timeout(struct timer_list *t) raw_spin_unlock_irq(&pool->lock); if (do_cull) - queue_work(system_unbound_wq, &pool->idle_cull_work); + queue_work(system_dfl_wq, &pool->idle_cull_work); } /** -- cgit v1.2.3 From a2be943b46b4a7478ea8ddf9bb8e5251c59fceb7 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 11:13:24 +0200 Subject: workqueue: replace use of system_wq with system_percpu_wq Currently, if a user enqueues a work item using schedule_delayed_work(), the wq used is "system_wq" (per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and to queue_work(), which again makes use of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API.
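To make the intent of these two renames concrete, here is a minimal sketch of the call sites the series is steering users toward; the work items and callbacks are hypothetical, and only the system_dfl_wq and system_percpu_wq names come from these patches:

#include <linux/workqueue.h>

/* Hedged sketch: hypothetical work items using the new wq names. */
static void unbound_fn(struct work_struct *work) { /* no locality needs */ }
static void percpu_fn(struct work_struct *work) { /* wants local CPU */ }

static DECLARE_WORK(unbound_work, unbound_fn);
static DECLARE_WORK(percpu_work, percpu_fn);

static void example_queue(void)
{
	/* Default choice: unbound, no CPU affinity implied. */
	queue_work(system_dfl_wq, &unbound_work);

	/* Only when per-CPU execution is really required. */
	queue_work(system_percpu_wq, &percpu_work);
}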
system_wq is a per-CPU workqueue, yet nothing in its name hints at that CPU affinity constraint, which is very often not required by users. Make it clear by adding a system_percpu_wq. queue_work() / queue_delayed_work() / mod_delayed_work() will now use the new per-cpu wq: if a user still sticks to the old name, a warning will be printed along with a redirect to the new one. This patch adds the new system_percpu_wq everywhere except for the mm, fs and net subsystems, which are handled in separate patches. The old wq will be kept for a few release cycles. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 22 +++++++++++----------- kernel/workqueue.c | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index af860e8f8481..b6834b7aee4b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -434,10 +434,10 @@ enum wq_consts { * short queue flush time. Don't queue works which can run for too * long. * - * system_highpri_wq is similar to system_wq but for work items which + * system_highpri_wq is similar to system_percpu_wq but for work items which * require WQ_HIGHPRI. * - * system_long_wq is similar to system_wq but may host long running + * system_long_wq is similar to system_percpu_wq but may host long running * works. Queue flushing might take relatively long. * * system_dfl_wq is unbound workqueue. Workers are not bound to @@ -445,13 +445,13 @@ enum wq_consts { * executed immediately as long as max_active limit is not reached and * resources are available. * - * system_freezable_wq is equivalent to system_wq except that it's + * system_freezable_wq is equivalent to system_percpu_wq except that it's * freezable. * * *_power_efficient_wq are inclined towards saving power and converted * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise, * they are same as their non-power-efficient counterparts - e.g. - * system_power_efficient_wq is identical to system_wq if + * system_power_efficient_wq is identical to system_percpu_wq if * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. * * system_bh[_highpri]_wq are convenience interface to softirq.
BH work items @@ -708,7 +708,7 @@ static inline bool mod_delayed_work(struct workqueue_struct *wq, */ static inline bool schedule_work_on(int cpu, struct work_struct *work) { - return queue_work_on(cpu, system_wq, work); + return queue_work_on(cpu, system_percpu_wq, work); } /** @@ -727,7 +727,7 @@ static inline bool schedule_work_on(int cpu, struct work_struct *work) */ static inline bool schedule_work(struct work_struct *work) { - return queue_work(system_wq, work); + return queue_work(system_percpu_wq, work); } /** @@ -770,15 +770,15 @@ extern void __warn_flushing_systemwide_wq(void) #define flush_scheduled_work() \ ({ \ __warn_flushing_systemwide_wq(); \ - __flush_workqueue(system_wq); \ + __flush_workqueue(system_percpu_wq); \ }) #define flush_workqueue(wq) \ ({ \ struct workqueue_struct *_wq = (wq); \ \ - if ((__builtin_constant_p(_wq == system_wq) && \ - _wq == system_wq) || \ + if ((__builtin_constant_p(_wq == system_percpu_wq) && \ + _wq == system_percpu_wq) || \ (__builtin_constant_p(_wq == system_highpri_wq) && \ _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ @@ -807,7 +807,7 @@ extern void __warn_flushing_systemwide_wq(void) static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work_on(cpu, system_wq, dwork, delay); + return queue_delayed_work_on(cpu, system_percpu_wq, dwork, delay); } /** @@ -821,7 +821,7 @@ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, static inline bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work(system_wq, dwork, delay); + return queue_delayed_work(system_percpu_wq, dwork, delay); } #ifndef CONFIG_SMP diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2888f4399acd..90db8cf015c2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7668,7 +7668,7 @@ static int wq_watchdog_param_set_thresh(const char *val, if (ret) return ret; - if (system_wq) + if (system_percpu_wq) wq_watchdog_set_thresh(thresh); else wq_watchdog_thresh = thresh; -- cgit v1.2.3 From 97248d05b70edc674f2f2fa835fed33172686b1d Mon Sep 17 00:00:00 2001 From: Zihuan Zhang Date: Tue, 2 Sep 2025 15:33:23 +0800 Subject: cpufreq: Drop redundant freq_table parameter Since commit e0b3165ba521 ("cpufreq: add 'freq_table' in struct cpufreq_policy"), freq_table has been stored in struct cpufreq_policy instead of being maintained separately. However, several helpers in freq_table.c still take both policy and freq_table as parameters, even though policy->freq_table can always be used. This leads to redundant function arguments and increases the chance of inconsistencies. This patch removes the unnecessary freq_table argument from these functions and updates their callers to only pass policy. This makes the code simpler, more consistent, and avoids duplication. Signed-off-by: Zihuan Zhang Acked-by: Viresh Kumar Link: https://patch.msgid.link/20250902073323.48330-1-zhangzihuan@kylinos.cn Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq.c | 2 +- drivers/cpufreq/freq_table.c | 14 ++++++-------- drivers/cpufreq/sh-cpufreq.c | 6 ++---- drivers/cpufreq/virtual-cpufreq.c | 2 +- include/linux/cpufreq.h | 7 +++---- 5 files changed, 13 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index a615c98d80ca..5fcc99f768d2 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2793,7 +2793,7 @@ int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) if (!policy->freq_table) return -ENXIO; - ret = cpufreq_frequency_table_cpuinfo(policy, policy->freq_table); + ret = cpufreq_frequency_table_cpuinfo(policy); if (ret) { pr_err("%s: Policy frequency update failed\n", __func__); return ret; diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c index 35de513af6c9..d5111ee56e38 100644 --- a/drivers/cpufreq/freq_table.c +++ b/drivers/cpufreq/freq_table.c @@ -28,10 +28,9 @@ static bool policy_has_boost_freq(struct cpufreq_policy *policy) return false; } -int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, - struct cpufreq_frequency_table *table) +int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy) { - struct cpufreq_frequency_table *pos; + struct cpufreq_frequency_table *pos, *table = policy->freq_table; unsigned int min_freq = ~0; unsigned int max_freq = 0; unsigned int freq; @@ -65,10 +64,9 @@ int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, return 0; } -int cpufreq_frequency_table_verify(struct cpufreq_policy_data *policy, - struct cpufreq_frequency_table *table) +int cpufreq_frequency_table_verify(struct cpufreq_policy_data *policy) { - struct cpufreq_frequency_table *pos; + struct cpufreq_frequency_table *pos, *table = policy->freq_table; unsigned int freq, prev_smaller = 0; bool found = false; @@ -110,7 +108,7 @@ int cpufreq_generic_frequency_table_verify(struct cpufreq_policy_data *policy) if (!policy->freq_table) return -ENODEV; - return cpufreq_frequency_table_verify(policy, policy->freq_table); + return cpufreq_frequency_table_verify(policy); } EXPORT_SYMBOL_GPL(cpufreq_generic_frequency_table_verify); @@ -354,7 +352,7 @@ int cpufreq_table_validate_and_sort(struct cpufreq_policy *policy) return 0; } - ret = cpufreq_frequency_table_cpuinfo(policy, policy->freq_table); + ret = cpufreq_frequency_table_cpuinfo(policy); if (ret) return ret; diff --git a/drivers/cpufreq/sh-cpufreq.c b/drivers/cpufreq/sh-cpufreq.c index 9c0b01e00508..642ddb9ea217 100644 --- a/drivers/cpufreq/sh-cpufreq.c +++ b/drivers/cpufreq/sh-cpufreq.c @@ -89,11 +89,9 @@ static int sh_cpufreq_target(struct cpufreq_policy *policy, static int sh_cpufreq_verify(struct cpufreq_policy_data *policy) { struct clk *cpuclk = &per_cpu(sh_cpuclk, policy->cpu); - struct cpufreq_frequency_table *freq_table; - freq_table = cpuclk->nr_freqs ? 
cpuclk->freq_table : NULL; - if (freq_table) - return cpufreq_frequency_table_verify(policy, freq_table); + if (policy->freq_table) + return cpufreq_frequency_table_verify(policy); cpufreq_verify_within_cpu_limits(policy); diff --git a/drivers/cpufreq/virtual-cpufreq.c b/drivers/cpufreq/virtual-cpufreq.c index 7dd1b0c263c7..6ffa16d239b2 100644 --- a/drivers/cpufreq/virtual-cpufreq.c +++ b/drivers/cpufreq/virtual-cpufreq.c @@ -250,7 +250,7 @@ static int virt_cpufreq_offline(struct cpufreq_policy *policy) static int virt_cpufreq_verify_policy(struct cpufreq_policy_data *policy) { if (policy->freq_table) - return cpufreq_frequency_table_verify(policy, policy->freq_table); + return cpufreq_frequency_table_verify(policy); cpufreq_verify_within_cpu_limits(policy); return 0; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 95f3807c8c55..40966512ea18 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -780,11 +780,10 @@ struct cpufreq_frequency_table { else -int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, - struct cpufreq_frequency_table *table); +int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy); + +int cpufreq_frequency_table_verify(struct cpufreq_policy_data *policy); -int cpufreq_frequency_table_verify(struct cpufreq_policy_data *policy, - struct cpufreq_frequency_table *table); int cpufreq_generic_frequency_table_verify(struct cpufreq_policy_data *policy); int cpufreq_table_index_unsorted(struct cpufreq_policy *policy, -- cgit v1.2.3 From 1544344563376b2a2ae2af5af1db00d6410c18e0 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 29 Aug 2025 15:28:44 +0800 Subject: rhashtable: Use __always_inline instead of inline Sometimes, the compiler is not clever enough to inline the rhashtable_lookup() for us, even if the "obj_cmpfn" and "key_len" in params is const. This can introduce more overhead. Therefore, use __always_inline for the rhashtable. Signed-off-by: Menglong Dong Signed-off-by: Herbert Xu --- include/linux/rhashtable.h | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 6c85b28ea30b..e740157f3cd7 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -122,7 +122,7 @@ static inline unsigned int rht_bucket_index(const struct bucket_table *tbl, return hash & (tbl->size - 1); } -static inline unsigned int rht_key_get_hash(struct rhashtable *ht, +static __always_inline unsigned int rht_key_get_hash(struct rhashtable *ht, const void *key, const struct rhashtable_params params, unsigned int hash_rnd) { @@ -152,7 +152,7 @@ static inline unsigned int rht_key_get_hash(struct rhashtable *ht, return hash; } -static inline unsigned int rht_key_hashfn( +static __always_inline unsigned int rht_key_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const void *key, const struct rhashtable_params params) { @@ -161,7 +161,7 @@ static inline unsigned int rht_key_hashfn( return rht_bucket_index(tbl, hash); } -static inline unsigned int rht_head_hashfn( +static __always_inline unsigned int rht_head_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he, const struct rhashtable_params params) { @@ -586,7 +586,7 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, } /* Internal function, do not use. 
*/ -static inline struct rhash_head *__rhashtable_lookup( +static __always_inline struct rhash_head *__rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { @@ -639,7 +639,7 @@ restart: * * Returns the first entry on which the compare function returned true. */ -static inline void *rhashtable_lookup( +static __always_inline void *rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { @@ -662,7 +662,7 @@ static inline void *rhashtable_lookup( * * Returns the first entry on which the compare function returned true. */ -static inline void *rhashtable_lookup_fast( +static __always_inline void *rhashtable_lookup_fast( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { @@ -689,7 +689,7 @@ static inline void *rhashtable_lookup_fast( * * Returns the list of entries that match the given key. */ -static inline struct rhlist_head *rhltable_lookup( +static __always_inline struct rhlist_head *rhltable_lookup( struct rhltable *hlt, const void *key, const struct rhashtable_params params) { @@ -702,7 +702,7 @@ static inline struct rhlist_head *rhltable_lookup( * function returns the existing element already in hashes if there is a clash, * otherwise it returns an error via ERR_PTR(). */ -static inline void *__rhashtable_insert_fast( +static __always_inline void *__rhashtable_insert_fast( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { @@ -825,7 +825,7 @@ out_unlock: * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ -static inline int rhashtable_insert_fast( +static __always_inline int rhashtable_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { @@ -854,7 +854,7 @@ static inline int rhashtable_insert_fast( * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ -static inline int rhltable_insert_key( +static __always_inline int rhltable_insert_key( struct rhltable *hlt, const void *key, struct rhlist_head *list, const struct rhashtable_params params) { @@ -877,7 +877,7 @@ static inline int rhltable_insert_key( * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ -static inline int rhltable_insert( +static __always_inline int rhltable_insert( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { @@ -902,7 +902,7 @@ static inline int rhltable_insert( * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ -static inline int rhashtable_lookup_insert_fast( +static __always_inline int rhashtable_lookup_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { @@ -929,7 +929,7 @@ static inline int rhashtable_lookup_insert_fast( * object if it exists, NULL if it did not and the insertion was successful, * and an ERR_PTR otherwise. */ -static inline void *rhashtable_lookup_get_insert_fast( +static __always_inline void *rhashtable_lookup_get_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { @@ -956,7 +956,7 @@ static inline void *rhashtable_lookup_get_insert_fast( * * Returns zero on success. 
*/ -static inline int rhashtable_lookup_insert_key( +static __always_inline int rhashtable_lookup_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { @@ -982,7 +982,7 @@ static inline int rhashtable_lookup_insert_key( * object if it exists, NULL if it does not and the insertion was successful, * and an ERR_PTR otherwise. */ -static inline void *rhashtable_lookup_get_insert_key( +static __always_inline void *rhashtable_lookup_get_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { @@ -992,7 +992,7 @@ static inline void *rhashtable_lookup_get_insert_key( } /* Internal function, please use rhashtable_remove_fast() instead */ -static inline int __rhashtable_remove_fast_one( +static __always_inline int __rhashtable_remove_fast_one( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) @@ -1074,7 +1074,7 @@ unlocked: } /* Internal function, please use rhashtable_remove_fast() instead */ -static inline int __rhashtable_remove_fast( +static __always_inline int __rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { @@ -1115,7 +1115,7 @@ static inline int __rhashtable_remove_fast( * * Returns zero on success, -ENOENT if the entry could not be found. */ -static inline int rhashtable_remove_fast( +static __always_inline int rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { @@ -1137,7 +1137,7 @@ static inline int rhashtable_remove_fast( * * Returns zero on success, -ENOENT if the entry could not be found. */ -static inline int rhltable_remove( +static __always_inline int rhltable_remove( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { @@ -1145,7 +1145,7 @@ static inline int rhltable_remove( } /* Internal function, please use rhashtable_replace_fast() instead */ -static inline int __rhashtable_replace_fast( +static __always_inline int __rhashtable_replace_fast( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) @@ -1208,7 +1208,7 @@ unlocked: * Returns zero on success, -ENOENT if the entry could not be found, * -EINVAL if hash is not the same for the old and new objects. */ -static inline int rhashtable_replace_fast( +static __always_inline int rhashtable_replace_fast( struct rhashtable *ht, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) -- cgit v1.2.3 From 886d6981208263b55a1eb8b39c5d00db1544b9bb Mon Sep 17 00:00:00 2001 From: Zhushuai Yin Date: Sat, 30 Aug 2025 18:27:57 +0800 Subject: crypto: hisilicon/zip - add hashjoin, gather, and UDMA data move features The new version of the hisilicon zip driver supports the hash join and gather features, as well as the data move feature (UDMA), including data copying and memory initialization functions. These features are registered to the uacce subsystem.
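For context, userspace discovers these capabilities through the uacce "algorithms" sysfs attribute, where the driver-built string in this patch ("hashagg\nudma\nhashjoin\ngather") is exposed as newline-separated names. A minimal sketch, with an illustrative device name:

/* Hedged sketch: userspace probing a uacce device's algorithm list. */
#include <stdio.h>
#include <string.h>

static int uacce_has_alg(const char *dev_name, const char *alg)
{
	char path[128], algs[512];
	size_t n;
	FILE *f;

	/* e.g. dev_name = "hisi_zip-0" (illustrative) */
	snprintf(path, sizeof(path),
		 "/sys/class/uacce/%s/algorithms", dev_name);
	f = fopen(path, "r");
	if (!f)
		return 0;
	n = fread(algs, 1, sizeof(algs) - 1, f);
	fclose(f);
	algs[n] = '\0';
	/* Names are newline-separated; a substring check keeps this short. */
	return strstr(algs, alg) != NULL;
}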
Signed-off-by: Zhushuai Yin Signed-off-by: Chenghai Huang Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/qm.c | 29 ++++++++++++++++++++++------- drivers/crypto/hisilicon/zip/dae_main.c | 11 +++++++++-- include/linux/hisi_acc_qm.h | 1 + include/uapi/misc/uacce/hisi_qm.h | 1 + 4 files changed, 33 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index 102aff9ea19a..98099117bb38 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -2742,6 +2742,27 @@ static void qm_remove_uacce(struct hisi_qm *qm) } } +static void qm_uacce_api_ver_init(struct hisi_qm *qm) +{ + struct uacce_device *uacce = qm->uacce; + + switch (qm->ver) { + case QM_HW_V1: + uacce->api_ver = HISI_QM_API_VER_BASE; + break; + case QM_HW_V2: + uacce->api_ver = HISI_QM_API_VER2_BASE; + break; + case QM_HW_V3: + case QM_HW_V4: + uacce->api_ver = HISI_QM_API_VER3_BASE; + break; + default: + uacce->api_ver = HISI_QM_API_VER5_BASE; + break; + } +} + static int qm_alloc_uacce(struct hisi_qm *qm) { struct pci_dev *pdev = qm->pdev; @@ -2775,13 +2796,6 @@ static int qm_alloc_uacce(struct hisi_qm *qm) uacce->is_vf = pdev->is_virtfn; uacce->priv = qm; - if (qm->ver == QM_HW_V1) - uacce->api_ver = HISI_QM_API_VER_BASE; - else if (qm->ver == QM_HW_V2) - uacce->api_ver = HISI_QM_API_VER2_BASE; - else - uacce->api_ver = HISI_QM_API_VER3_BASE; - if (qm->ver == QM_HW_V1) mmio_page_nr = QM_DOORBELL_PAGE_NR; else if (!test_bit(QM_SUPPORT_DB_ISOLATION, &qm->caps)) @@ -2801,6 +2815,7 @@ static int qm_alloc_uacce(struct hisi_qm *qm) uacce->qf_pg_num[UACCE_QFRT_DUS] = dus_page_nr; qm->uacce = uacce; + qm_uacce_api_ver_init(qm); INIT_LIST_HEAD(&qm->isolate_data.qm_hw_errs); mutex_init(&qm->isolate_data.isolate_lock); diff --git a/drivers/crypto/hisilicon/zip/dae_main.c b/drivers/crypto/hisilicon/zip/dae_main.c index 6f22e4c36e49..4c5481c77436 100644 --- a/drivers/crypto/hisilicon/zip/dae_main.c +++ b/drivers/crypto/hisilicon/zip/dae_main.c @@ -15,6 +15,7 @@ #define DAE_REG_RD_TMOUT_US USEC_PER_SEC #define DAE_ALG_NAME "hashagg" +#define DAE_V5_ALG_NAME "hashagg\nudma\nhashjoin\ngather" /* error */ #define DAE_AXI_CFG_OFFSET 0x331000 @@ -82,6 +83,7 @@ int hisi_dae_set_user_domain(struct hisi_qm *qm) int hisi_dae_set_alg(struct hisi_qm *qm) { + const char *alg_name; size_t len; if (!dae_is_support(qm)) @@ -90,9 +92,14 @@ int hisi_dae_set_alg(struct hisi_qm *qm) if (!qm->uacce) return 0; + if (qm->ver >= QM_HW_V5) + alg_name = DAE_V5_ALG_NAME; + else + alg_name = DAE_ALG_NAME; + len = strlen(qm->uacce->algs); /* A line break may be required */ - if (len + strlen(DAE_ALG_NAME) + 1 >= QM_DEV_ALG_MAX_LEN) { + if (len + strlen(alg_name) + 1 >= QM_DEV_ALG_MAX_LEN) { pci_err(qm->pdev, "algorithm name is too long!\n"); return -EINVAL; } @@ -100,7 +107,7 @@ int hisi_dae_set_alg(struct hisi_qm *qm) if (len) strcat((char *)qm->uacce->algs, "\n"); - strcat((char *)qm->uacce->algs, DAE_ALG_NAME); + strcat((char *)qm->uacce->algs, alg_name); return 0; } diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index 0c4c84b8c3be..f2254ddc327c 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -125,6 +125,7 @@ enum qm_hw_ver { QM_HW_V2 = 0x21, QM_HW_V3 = 0x30, QM_HW_V4 = 0x50, + QM_HW_V5 = 0x51, }; enum qm_fun_type { diff --git a/include/uapi/misc/uacce/hisi_qm.h b/include/uapi/misc/uacce/hisi_qm.h index 3e66dbc2f323..10504b48eabf 100644 --- a/include/uapi/misc/uacce/hisi_qm.h +++ 
b/include/uapi/misc/uacce/hisi_qm.h @@ -31,6 +31,7 @@ struct hisi_qp_info { #define HISI_QM_API_VER_BASE "hisi_qm_v1" #define HISI_QM_API_VER2_BASE "hisi_qm_v2" #define HISI_QM_API_VER3_BASE "hisi_qm_v3" +#define HISI_QM_API_VER5_BASE "hisi_qm_v5" /* UACCE_CMD_QM_SET_QP_CTX: Set qp algorithm type */ #define UACCE_CMD_QM_SET_QP_CTX _IOWR('H', 10, struct hisi_qp_ctx) -- cgit v1.2.3 From c2ce2453413d429e302659abc5ace634e873f6f5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Aug 2025 12:59:24 +0200 Subject: driver core/PM: Set power.no_callbacks along with power.no_pm Devices with power.no_pm set are not expected to need any power management at all, so modify device_set_pm_not_required() to set power.no_callbacks for them too in case runtime PM will be enabled for any of them (which in principle may be done for convenience if such a device participates in a dependency chain). Since device_set_pm_not_required() must be called before device_add() or it would not have any effect, it can update power.no_callbacks without locking, unlike pm_runtime_no_callbacks() that can be called after registering the target device. Signed-off-by: Rafael J. Wysocki Cc: stable Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/1950054.tdWV9SEqCh@rafael.j.wysocki Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 0470d19da7f2..b031ff71a5bd 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -851,6 +851,9 @@ static inline bool device_pm_not_required(struct device *dev) static inline void device_set_pm_not_required(struct device *dev) { dev->power.no_pm = true; +#ifdef CONFIG_PM + dev->power.no_callbacks = true; +#endif } static inline void dev_pm_syscore_device(struct device *dev, bool val) -- cgit v1.2.3 From 20f988320d2718ef28b1f0635acc88c12a216d29 Mon Sep 17 00:00:00 2001 From: "Rai, Amardeep" Date: Wed, 20 Aug 2025 17:38:19 +0300 Subject: usb: core: Add a function to get USB version independent periodic payload Add usb_endpoint_max_periodic_payload() to obtain maximum payload bytes in a service interval for isochronous and interrupt endpoints in a USB version independent way. Signed-off-by: Rai, Amardeep Signed-off-by: Mathias Nyman Co-developed-by: Sakari Ailus Signed-off-by: Sakari Ailus Reviewed-by: Hans de Goede Acked-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250820143824.551777-5-sakari.ailus@linux.intel.com --- drivers/usb/core/usb.c | 29 +++++++++++++++++++++++++++++ include/linux/usb.h | 3 +++ 2 files changed, 32 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index fca7735fc660..ca9ff6ad8e73 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -1110,6 +1110,35 @@ void usb_free_noncoherent(struct usb_device *dev, size_t size, } EXPORT_SYMBOL_GPL(usb_free_noncoherent); +/** + * usb_endpoint_max_periodic_payload - Get maximum payload bytes per service + * interval + * @udev: The USB device + * @ep: The endpoint + * + * Returns: the maximum number of bytes isochronous or interrupt endpoint @ep + * can transfer during a service interval, or 0 for other endpoints. 
+ */ +u32 usb_endpoint_max_periodic_payload(struct usb_device *udev, + const struct usb_host_endpoint *ep) +{ + if (!usb_endpoint_xfer_isoc(&ep->desc) && + !usb_endpoint_xfer_int(&ep->desc)) + return 0; + + switch (udev->speed) { + case USB_SPEED_SUPER_PLUS: + if (USB_SS_SSP_ISOC_COMP(ep->ss_ep_comp.bmAttributes)) + return le32_to_cpu(ep->ssp_isoc_ep_comp.dwBytesPerInterval); + fallthrough; + case USB_SPEED_SUPER: + return le16_to_cpu(ep->ss_ep_comp.wBytesPerInterval); + default: + return usb_endpoint_maxp(&ep->desc) * usb_endpoint_maxp_mult(&ep->desc); + } +} +EXPORT_SYMBOL_GPL(usb_endpoint_max_periodic_payload); + /* * Notifications of device and interface registration */ diff --git a/include/linux/usb.h b/include/linux/usb.h index 9d662c6abb4d..e9cf2786d8bd 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -2039,6 +2039,9 @@ static inline u16 usb_maxpacket(struct usb_device *udev, int pipe) return usb_endpoint_maxp(&ep->desc); } +u32 usb_endpoint_max_periodic_payload(struct usb_device *udev, + const struct usb_host_endpoint *ep); + /* translate USB error codes to codes user space understands */ static inline int usb_translate_errors(int error_code) { -- cgit v1.2.3 From d6725169a9bbcb5bd1dd14b2891b874614c59f52 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 20 Aug 2025 17:38:21 +0300 Subject: usb: core: Introduce usb_endpoint_is_hs_isoc_double() Introduce usb_endpoint_is_hs_isoc_double() to tell whether an endpoint conforms to USB 2.0 Isochronous Double IN Bandwidth ECN. Signed-off-by: Sakari Ailus Acked-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250820143824.551777-7-sakari.ailus@linux.intel.com --- drivers/usb/core/usb.c | 19 +++++++++++++++++++ include/linux/usb.h | 3 +++ 2 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index ca9ff6ad8e73..939dc4aafb89 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -1139,6 +1139,25 @@ u32 usb_endpoint_max_periodic_payload(struct usb_device *udev, } EXPORT_SYMBOL_GPL(usb_endpoint_max_periodic_payload); +/** + * usb_endpoint_is_hs_isoc_double - Tell whether an endpoint uses USB 2 + * Isochronous Double IN Bandwidth + * @udev: The USB device + * @ep: The endpoint + * + * Returns: true if an endpoint @ep conforms to USB 2 Isochronous Double IN + * Bandwidth ECN, false otherwise.
+ */ +bool usb_endpoint_is_hs_isoc_double(struct usb_device *udev, + const struct usb_host_endpoint *ep) +{ + return ep->eusb2_isoc_ep_comp.bDescriptorType && + le16_to_cpu(udev->descriptor.bcdUSB) == 0x220 && + usb_endpoint_is_isoc_in(&ep->desc) && + !le16_to_cpu(ep->desc.wMaxPacketSize); +} +EXPORT_SYMBOL_GPL(usb_endpoint_is_hs_isoc_double); + /* * Notifications of device and interface registration */ diff --git a/include/linux/usb.h b/include/linux/usb.h index e9cf2786d8bd..70ef00c42d22 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -2042,6 +2042,9 @@ static inline u16 usb_maxpacket(struct usb_device *udev, int pipe) u32 usb_endpoint_max_periodic_payload(struct usb_device *udev, const struct usb_host_endpoint *ep); +bool usb_endpoint_is_hs_isoc_double(struct usb_device *udev, + const struct usb_host_endpoint *ep); + /* translate USB error codes to codes user space understands */ static inline int usb_translate_errors(int error_code) { -- cgit v1.2.3 From 23743ba64709a9c137c1b928f8b8e00d846af9cc Mon Sep 17 00:00:00 2001 From: Calixte Pernot Date: Mon, 25 Aug 2025 14:56:09 +0200 Subject: vt: add support for smput/rmput escape codes Support "\e[?1049h" and "\e[?1049l" escape codes. This patch allows programs to enter and leave alternate screens. This feature is widely available in graphical terminal emulators and mostly used by fullscreen terminal-based user interfaces such as text editors. Most editors such as vim and nano assume this escape code in not supported and will not try to print the escape sequence if TERM=linux. To try out this patch, run `TERM=xterm-256color vim` inside a VT. Signed-off-by: Calixte Pernot Link: https://lore.kernel.org/r/20250825125607.2478-3-calixte.pernot@grenoble-inp.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt.c | 58 ++++++++++++++++++++++++++++++++++++++++++ include/linux/console_struct.h | 3 +++ 2 files changed, 61 insertions(+) (limited to 'include/linux') diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 869261141535..3f80e98e5c51 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -141,6 +141,7 @@ static const struct consw *con_driver_map[MAX_NR_CONSOLES]; static int con_open(struct tty_struct *, struct file *); static void vc_init(struct vc_data *vc, int do_clear); static void gotoxy(struct vc_data *vc, int new_x, int new_y); +static void restore_cur(struct vc_data *vc); static void save_cur(struct vc_data *vc); static void reset_terminal(struct vc_data *vc, int do_clear); static void con_flush_chars(struct tty_struct *tty); @@ -1341,6 +1342,10 @@ struct vc_data *vc_deallocate(unsigned int currcons) kfree(vc->vc_screenbuf); vc_cons[currcons].d = NULL; } + if (vc->vc_saved_screen != NULL) { + kfree(vc->vc_saved_screen); + vc->vc_saved_screen = NULL; + } return vc; } @@ -1875,6 +1880,45 @@ static int get_bracketed_paste(struct tty_struct *tty) return vc->vc_bracketed_paste; } +/* console_lock is held */ +static void enter_alt_screen(struct vc_data *vc) +{ + unsigned int size = vc->vc_rows * vc->vc_cols * 2; + + if (vc->vc_saved_screen != NULL) + return; /* Already inside an alt-screen */ + vc->vc_saved_screen = kmemdup((u16 *)vc->vc_origin, size, GFP_KERNEL); + if (vc->vc_saved_screen == NULL) + return; + vc->vc_saved_rows = vc->vc_rows; + vc->vc_saved_cols = vc->vc_cols; + save_cur(vc); + /* clear entire screen */ + csi_J(vc, CSI_J_FULL); +} + +/* console_lock is held */ +static void leave_alt_screen(struct vc_data *vc) +{ + unsigned int rows = min(vc->vc_saved_rows, vc->vc_rows); + unsigned int cols = 
min(vc->vc_saved_cols, vc->vc_cols); + u16 *src, *dest; + + if (vc->vc_saved_screen == NULL) + return; /* Not inside an alt-screen */ + for (unsigned int r = 0; r < rows; r++) { + src = vc->vc_saved_screen + r * vc->vc_saved_cols; + dest = ((u16 *)vc->vc_origin) + r * vc->vc_cols; + memcpy(dest, src, 2 * cols); + } + restore_cur(vc); + /* Update the entire screen */ + if (con_should_update(vc)) + do_update_region(vc, vc->vc_origin, vc->vc_screenbuf_size / 2); + kfree(vc->vc_saved_screen); + vc->vc_saved_screen = NULL; +} + enum { CSI_DEC_hl_CURSOR_KEYS = 1, /* CKM: cursor keys send ^[Ox/^[[x */ CSI_DEC_hl_132_COLUMNS = 3, /* COLM: 80/132 mode switch */ @@ -1885,6 +1929,7 @@ enum { CSI_DEC_hl_MOUSE_X10 = 9, CSI_DEC_hl_SHOW_CURSOR = 25, /* TCEM */ CSI_DEC_hl_MOUSE_VT200 = 1000, + CSI_DEC_hl_ALT_SCREEN = 1049, CSI_DEC_hl_BRACKETED_PASTE = 2004, }; @@ -1941,6 +1986,12 @@ static void csi_DEC_hl(struct vc_data *vc, bool on_off) case CSI_DEC_hl_BRACKETED_PASTE: vc->vc_bracketed_paste = on_off; break; + case CSI_DEC_hl_ALT_SCREEN: + if (on_off) + enter_alt_screen(vc); + else + leave_alt_screen(vc); + break; } } @@ -2179,6 +2230,13 @@ static void reset_terminal(struct vc_data *vc, int do_clear) vc->vc_deccm = global_cursor_default; vc->vc_decim = 0; + if (vc->vc_saved_screen != NULL) { + kfree(vc->vc_saved_screen); + vc->vc_saved_screen = NULL; + vc->vc_saved_rows = 0; + vc->vc_saved_cols = 0; + } + vt_reset_keyboard(vc->vc_num); vc->vc_cursor_type = cur_default; diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index 59b4fec5f254..13b35637bd5a 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -159,6 +159,9 @@ struct vc_data { struct uni_pagedict *uni_pagedict; struct uni_pagedict **uni_pagedict_loc; /* [!] Location of uni_pagedict variable for this console */ u32 **vc_uni_lines; /* unicode screen content */ + u16 *vc_saved_screen; + unsigned int vc_saved_cols; + unsigned int vc_saved_rows; /* additional information is in vt_kern.h */ }; -- cgit v1.2.3 From 0bcd01f757bc06471c82a137eafee281ef1b6e38 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 28 Aug 2024 21:56:57 -0700 Subject: hwmon: Introduce 64-bit energy attribute support Many chips require 64-bit variables to display the accumulated energy, even more so since the energy units are micro-Joule. Add new sensor type "energy64" to support reporting the chip energy as 64-bit values. Changing the entire hardware monitoring API is not feasible, and it is only really necessary to support reading 64-bit values for the "energyX_input" attribute. For this reason, keep the API as-is and use type casts on both ends to pass 64-bit pointers when reading the accumulated energy. On the write side (which is only useful for the energyX_enable attribute), keep passing the written value as long. 
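The casting convention described above looks like this from a driver's side; a minimal sketch in which the driver, its channel handling, and the foo_read_energy_uj() helper are hypothetical, while hwmon_energy64 and the cast through long * are what this patch defines:

#include <linux/hwmon.h>

/* Hypothetical hardware access returning accumulated micro-Joules. */
static s64 foo_read_energy_uj(int channel)
{
	return 0; /* placeholder for a real register read */
}

/* Hedged sketch: hypothetical read callback reporting 64-bit energy. */
static int foo_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
			  u32 attr, int channel, long *val)
{
	if (type == hwmon_energy64 && attr == hwmon_energy_input) {
		/* The core hands in a pointer to s64 disguised as long *. */
		s64 *energy_uj = (s64 *)val;

		*energy_uj = foo_read_energy_uj(channel);
		return 0;
	}

	return -EOPNOTSUPP;
}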
Reviewed-by: Chris Packham Tested-by: Chris Packham # INA780 Signed-off-by: Guenter Roeck --- Documentation/hwmon/hwmon-kernel-api.rst | 3 +++ drivers/hwmon/hwmon.c | 14 ++++++++++---- include/linux/hwmon.h | 1 + include/trace/events/hwmon.h | 10 +++++----- 4 files changed, 19 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/hwmon/hwmon-kernel-api.rst b/Documentation/hwmon/hwmon-kernel-api.rst index e47fc757e63e..037b69c23cb5 100644 --- a/Documentation/hwmon/hwmon-kernel-api.rst +++ b/Documentation/hwmon/hwmon-kernel-api.rst @@ -159,6 +159,7 @@ It contains following fields: hwmon_curr Current sensor hwmon_power Power sensor hwmon_energy Energy sensor + hwmon_energy64 Energy sensor, reported as 64-bit signed value hwmon_humidity Humidity sensor hwmon_fan Fan speed sensor hwmon_pwm PWM control @@ -288,6 +289,8 @@ Parameters: The sensor channel number. val: Pointer to attribute value. + For hwmon_energy64, ``val`` is passed as ``long *`` but needs + a typecast to ``s64 *``. Return value: 0 on success, a negative error number otherwise. diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c index 1688c210888a..2e17f3a4c59b 100644 --- a/drivers/hwmon/hwmon.c +++ b/drivers/hwmon/hwmon.c @@ -426,18 +426,22 @@ static ssize_t hwmon_attr_show(struct device *dev, struct device_attribute *devattr, char *buf) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); + s64 val64; long val; int ret; ret = hattr->ops->read(dev, hattr->type, hattr->attr, hattr->index, - &val); + (hattr->type == hwmon_energy64) ? (long *)&val64 : &val); if (ret < 0) return ret; + if (hattr->type != hwmon_energy64) + val64 = val; + trace_hwmon_attr_show(hattr->index + hwmon_attr_base(hattr->type), - hattr->name, val); + hattr->name, val64); - return sprintf(buf, "%ld\n", val); + return sprintf(buf, "%lld\n", val64); } static ssize_t hwmon_attr_show_string(struct device *dev, @@ -478,7 +482,7 @@ static ssize_t hwmon_attr_store(struct device *dev, return ret; trace_hwmon_attr_store(hattr->index + hwmon_attr_base(hattr->type), - hattr->name, val); + hattr->name, (s64)val); return count; } @@ -734,6 +738,7 @@ static const char * const *__templates[] = { [hwmon_curr] = hwmon_curr_attr_templates, [hwmon_power] = hwmon_power_attr_templates, [hwmon_energy] = hwmon_energy_attr_templates, + [hwmon_energy64] = hwmon_energy_attr_templates, [hwmon_humidity] = hwmon_humidity_attr_templates, [hwmon_fan] = hwmon_fan_attr_templates, [hwmon_pwm] = hwmon_pwm_attr_templates, @@ -747,6 +752,7 @@ static const int __templates_size[] = { [hwmon_curr] = ARRAY_SIZE(hwmon_curr_attr_templates), [hwmon_power] = ARRAY_SIZE(hwmon_power_attr_templates), [hwmon_energy] = ARRAY_SIZE(hwmon_energy_attr_templates), + [hwmon_energy64] = ARRAY_SIZE(hwmon_energy_attr_templates), [hwmon_humidity] = ARRAY_SIZE(hwmon_humidity_attr_templates), [hwmon_fan] = ARRAY_SIZE(hwmon_fan_attr_templates), [hwmon_pwm] = ARRAY_SIZE(hwmon_pwm_attr_templates), diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 3a63dff62d03..886fc90b2d25 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -24,6 +24,7 @@ enum hwmon_sensor_types { hwmon_curr, hwmon_power, hwmon_energy, + hwmon_energy64, hwmon_humidity, hwmon_fan, hwmon_pwm, diff --git a/include/trace/events/hwmon.h b/include/trace/events/hwmon.h index d1ff560cd9b5..3865098f21f1 100644 --- a/include/trace/events/hwmon.h +++ b/include/trace/events/hwmon.h @@ -9,14 +9,14 @@ DECLARE_EVENT_CLASS(hwmon_attr_class, - TP_PROTO(int index, const char *attr_name, long val), +
TP_PROTO(int index, const char *attr_name, long long val), TP_ARGS(index, attr_name, val), TP_STRUCT__entry( __field(int, index) __string(attr_name, attr_name) - __field(long, val) + __field(long long, val) ), TP_fast_assign( @@ -25,20 +25,20 @@ DECLARE_EVENT_CLASS(hwmon_attr_class, __entry->val = val; ), - TP_printk("index=%d, attr_name=%s, val=%ld", + TP_printk("index=%d, attr_name=%s, val=%lld", __entry->index, __get_str(attr_name), __entry->val) ); DEFINE_EVENT(hwmon_attr_class, hwmon_attr_show, - TP_PROTO(int index, const char *attr_name, long val), + TP_PROTO(int index, const char *attr_name, long long val), TP_ARGS(index, attr_name, val) ); DEFINE_EVENT(hwmon_attr_class, hwmon_attr_store, - TP_PROTO(int index, const char *attr_name, long val), + TP_PROTO(int index, const char *attr_name, long long val), TP_ARGS(index, attr_name, val) ); -- cgit v1.2.3 From d364d2ad07873dc4991b2a631a8536597272418b Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 2 Sep 2025 13:59:11 +0200 Subject: devres: provide devm_kmemdup_const() Provide a function similar to devm_strdup_const() but for copying blocks of memory that are likely to be placed in .rodata. Reviewed-by: Andy Shevchenko Acked-by: Greg Kroah-Hartman Tested-by: Neil Armstrong Signed-off-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- drivers/base/devres.c | 21 +++++++++++++++++++++ include/linux/device/devres.h | 2 ++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/devres.c b/drivers/base/devres.c index ff55e1bcfa30..c948c88d3956 100644 --- a/drivers/base/devres.c +++ b/drivers/base/devres.c @@ -1117,6 +1117,27 @@ void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp) } EXPORT_SYMBOL_GPL(devm_kmemdup); +/** + * devm_kmemdup_const - conditionally duplicate and manage a region of memory + * + * @dev: Device this memory belongs to + * @src: memory region to duplicate + * @len: memory region length + * @gfp: GFP mask to use + * + * Return: source address if it is in .rodata or the return value of kmemdup() + * to which the function falls back otherwise. + */ +const void * +devm_kmemdup_const(struct device *dev, const void *src, size_t len, gfp_t gfp) +{ + if (is_kernel_rodata((unsigned long)src)) + return src; + + return devm_kmemdup(dev, src, len, gfp); +} +EXPORT_SYMBOL_GPL(devm_kmemdup_const); + struct pages_devres { unsigned long addr; unsigned int order; diff --git a/include/linux/device/devres.h b/include/linux/device/devres.h index ae696d10faff..8c5f57e0d613 100644 --- a/include/linux/device/devres.h +++ b/include/linux/device/devres.h @@ -80,6 +80,8 @@ void devm_kfree(struct device *dev, const void *p); void * __realloc_size(3) devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp); +const void * +devm_kmemdup_const(struct device *dev, const void *src, size_t len, gfp_t gfp); static inline void *devm_kmemdup_array(struct device *dev, const void *src, size_t n, size_t size, gfp_t flags) { -- cgit v1.2.3 From 11aa02d6a9c222260490f952d041dec6d7f16a92 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 2 Sep 2025 13:59:22 +0200 Subject: pinctrl: allow to mark pin functions as requestable GPIOs The name of the pin function has no real meaning to pinctrl core and is there only for human readability of device properties. Some pins are muxed as GPIOs but for "strict" pinmuxers it's impossible to request them as GPIOs if they're bound to a device - even if their function name explicitly says "gpio".
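For illustration only: with the PINCTRL_GPIO_PINFUNCTION() macro introduced below, a driver would mark such a function as GPIO-capable roughly like this (the function and group names are made up):

	static const char * const gpio_groups[] = { "gpio0_grp", "gpio1_grp" };

	static const struct pinfunction gpio_func =
		PINCTRL_GPIO_PINFUNCTION("gpio", gpio_groups,
					 ARRAY_SIZE(gpio_groups));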
Add a new field to struct pinfunction that allows passing additional flags to pinctrl core. While we could go with a boolean "is_gpio" field, a flags field is more future-proof. If the PINFUNCTION_FLAG_GPIO is set for a given function, the pin muxed to it can be requested as GPIO even on strict pin controllers. Add a new callback to struct pinmux_ops - function_is_gpio() - that allows the pinmux core to inspect a function and see if it's a GPIO one. Provide a generic implementation of this callback. Tested-by: Neil Armstrong Signed-off-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- drivers/pinctrl/pinmux.c | 46 ++++++++++++++++++++++++++++++++++++++--- drivers/pinctrl/pinmux.h | 3 +++ include/linux/pinctrl/pinctrl.h | 14 +++++++++++++ include/linux/pinctrl/pinmux.h | 2 ++ 4 files changed, 62 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/pinctrl/pinmux.c b/drivers/pinctrl/pinmux.c index 07ec93f09334..3a8dd184ba3d 100644 --- a/drivers/pinctrl/pinmux.c +++ b/drivers/pinctrl/pinmux.c @@ -89,13 +89,20 @@ bool pinmux_can_be_used_for_gpio(struct pinctrl_dev *pctldev, unsigned int pin) { struct pin_desc *desc = pin_desc_get(pctldev, pin); const struct pinmux_ops *ops = pctldev->desc->pmxops; + const struct pinctrl_setting_mux *mux_setting; + bool func_is_gpio = false; /* Can't inspect pin, assume it can be used */ if (!desc || !ops) return true; + mux_setting = desc->mux_setting; + guard(mutex)(&desc->mux_lock); - if (ops->strict && desc->mux_usecount) + if (mux_setting && ops->function_is_gpio) + func_is_gpio = ops->function_is_gpio(pctldev, mux_setting->func); + + if (ops->strict && desc->mux_usecount && !func_is_gpio) return false; return !(ops->strict && !!desc->gpio_owner); @@ -116,7 +123,9 @@ static int pin_request(struct pinctrl_dev *pctldev, { struct pin_desc *desc; const struct pinmux_ops *ops = pctldev->desc->pmxops; + const struct pinctrl_setting_mux *mux_setting; int status = -EINVAL; + bool gpio_ok = false; desc = pin_desc_get(pctldev, pin); if (desc == NULL) { @@ -126,11 +135,21 @@ static int pin_request(struct pinctrl_dev *pctldev, goto out; } + mux_setting = desc->mux_setting; + dev_dbg(pctldev->dev, "request pin %d (%s) for %s\n", pin, desc->name, owner); scoped_guard(mutex, &desc->mux_lock) { - if ((!gpio_range || ops->strict) && + if (mux_setting) { + if (ops->function_is_gpio) + gpio_ok = ops->function_is_gpio(pctldev, + mux_setting->func); + } else { + gpio_ok = true; + } + + if ((!gpio_range || ops->strict) && !gpio_ok && desc->mux_usecount && strcmp(desc->mux_owner, owner)) { dev_err(pctldev->dev, "pin %s already requested by %s; cannot claim for %s\n", @@ -138,7 +157,7 @@ static int pin_request(struct pinctrl_dev *pctldev, goto out; } - if ((gpio_range || ops->strict) && desc->gpio_owner) { + if ((gpio_range || ops->strict) && !gpio_ok && desc->gpio_owner) { dev_err(pctldev->dev, "pin %s already requested by %s; cannot claim for %s\n", desc->name, desc->gpio_owner, owner); @@ -861,6 +880,27 @@ pinmux_generic_get_function(struct pinctrl_dev *pctldev, unsigned int selector) } EXPORT_SYMBOL_GPL(pinmux_generic_get_function); +/** + * pinmux_generic_function_is_gpio() - returns true if given function is a GPIO + * @pctldev: pin controller device + * @selector: function number + * + * Returns: + * True if given function is a GPIO, false otherwise.
+ */ +bool pinmux_generic_function_is_gpio(struct pinctrl_dev *pctldev, + unsigned int selector) +{ + struct function_desc *function; + + function = radix_tree_lookup(&pctldev->pin_function_tree, selector); + if (!function) + return false; + + return function->func->flags & PINFUNCTION_FLAG_GPIO; +} +EXPORT_SYMBOL_GPL(pinmux_generic_function_is_gpio); + /** * pinmux_generic_add_function() - adds a function group * @pctldev: pin controller device diff --git a/drivers/pinctrl/pinmux.h b/drivers/pinctrl/pinmux.h index 653684290666..4e826c1a5246 100644 --- a/drivers/pinctrl/pinmux.h +++ b/drivers/pinctrl/pinmux.h @@ -169,6 +169,9 @@ int pinmux_generic_remove_function(struct pinctrl_dev *pctldev, void pinmux_generic_free_functions(struct pinctrl_dev *pctldev); +bool pinmux_generic_function_is_gpio(struct pinctrl_dev *pctldev, + unsigned int selector); + #else static inline void pinmux_generic_free_functions(struct pinctrl_dev *pctldev) diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index d138e1815645..1a8084e29405 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -11,6 +11,7 @@ #ifndef __LINUX_PINCTRL_PINCTRL_H #define __LINUX_PINCTRL_PINCTRL_H +#include #include struct device; @@ -206,16 +207,20 @@ extern int pinctrl_get_group_pins(struct pinctrl_dev *pctldev, const char *pin_group, const unsigned int **pins, unsigned int *num_pins); +#define PINFUNCTION_FLAG_GPIO BIT(0) + /** * struct pinfunction - Description about a function * @name: Name of the function * @groups: An array of groups for this function * @ngroups: Number of groups in @groups + * @flags: Additional pin function flags */ struct pinfunction { const char *name; const char * const *groups; size_t ngroups; + unsigned long flags; }; /* Convenience macro to define a single named pinfunction */ @@ -226,6 +231,15 @@ struct pinfunction { .ngroups = (_ngroups), \ } +/* Same as PINCTRL_PINFUNCTION() but for the GPIO category of functions */ +#define PINCTRL_GPIO_PINFUNCTION(_name, _groups, _ngroups) \ +(struct pinfunction) { \ + .name = (_name), \ + .groups = (_groups), \ + .ngroups = (_ngroups), \ + .flags = PINFUNCTION_FLAG_GPIO, \ + } + #if IS_ENABLED(CONFIG_OF) && IS_ENABLED(CONFIG_PINCTRL) extern struct pinctrl_dev *of_pinctrl_get(struct device_node *np); #else diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h index d6f7b58d6ad0..6db6c3e1ccc2 100644 --- a/include/linux/pinctrl/pinmux.h +++ b/include/linux/pinctrl/pinmux.h @@ -66,6 +66,8 @@ struct pinmux_ops { unsigned int selector, const char * const **groups, unsigned int *num_groups); + bool (*function_is_gpio) (struct pinctrl_dev *pctldev, + unsigned int selector); int (*set_mux) (struct pinctrl_dev *pctldev, unsigned int func_selector, unsigned int group_selector); int (*gpio_request_enable) (struct pinctrl_dev *pctldev, -- cgit v1.2.3 From 203a83112e097a501fbe12722b6342787497efe0 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 5 Sep 2025 11:21:50 +0200 Subject: pinctrl: generic: rename PIN_CONFIG_OUTPUT to LEVEL This generic pin config property is confusingly named so let's rename it to make things clearer. There are already drivers in the tree that use PIN_CONFIG_OUTPUT to *read* the value of an output driven pin, which is a big semantic confusion for the head: are we then reading the setting of the output or the actual value/level that is put out on the pin? 
We already have PIN_CONFIG_OUTPUT_ENABLE that turns on driver buffers for output, so this can by logical conclusion only drive the voltage level if it should be any different. But if we read the pin, are we then reading the *setting* of the output value or the *actual* value we can see on the line? If the pin has not first been set into output mode with PIN_CONFIG_OUTPUT_ENABLE, but is instead in some input mode or tristate, what will reading this property actually return? Reading the current users of this property, it is clear that what we read is the logical level of the pin as 0 or 1 depending on if it is low or high. Rename it to PIN_CONFIG_LEVEL so it is crystal clear that we set or read the voltage level of the pin and nothing else. Acked-by: Sudeep Holla Signed-off-by: Linus Walleij --- Documentation/driver-api/pin-control.rst | 4 ++-- drivers/gpio/gpio-rockchip.c | 2 +- drivers/pinctrl/bcm/pinctrl-bcm2835.c | 6 +++--- drivers/pinctrl/cirrus/pinctrl-madera-core.c | 4 ++-- drivers/pinctrl/mediatek/pinctrl-airoha.c | 4 ++-- drivers/pinctrl/mediatek/pinctrl-moore.c | 2 +- drivers/pinctrl/mediatek/pinctrl-mtk-common.c | 2 +- drivers/pinctrl/mediatek/pinctrl-paris.c | 4 ++-- drivers/pinctrl/meson/pinctrl-amlogic-a4.c | 6 +++--- drivers/pinctrl/meson/pinctrl-meson.c | 6 +++--- drivers/pinctrl/nomadik/pinctrl-abx500.c | 6 +++--- drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c | 6 +++--- drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c | 6 +++--- drivers/pinctrl/pinconf-generic.c | 6 +++--- drivers/pinctrl/pinctrl-at91-pio4.c | 2 +- drivers/pinctrl/pinctrl-aw9523.c | 6 +++--- drivers/pinctrl/pinctrl-cy8c95x0.c | 2 +- drivers/pinctrl/pinctrl-ingenic.c | 4 ++-- drivers/pinctrl/pinctrl-k210.c | 2 +- drivers/pinctrl/pinctrl-microchip-sgpio.c | 4 ++-- drivers/pinctrl/pinctrl-ocelot.c | 4 ++-- drivers/pinctrl/pinctrl-pic32.c | 4 ++-- drivers/pinctrl/pinctrl-rk805.c | 4 ++-- drivers/pinctrl/pinctrl-rockchip.c | 6 +++--- drivers/pinctrl/pinctrl-rp1.c | 2 +- drivers/pinctrl/pinctrl-scmi.c | 2 +- drivers/pinctrl/pinctrl-stmfx.c | 4 ++-- drivers/pinctrl/pinctrl-sx150x.c | 8 ++++---- drivers/pinctrl/qcom/pinctrl-lpass-lpi.c | 8 ++++---- drivers/pinctrl/qcom/pinctrl-msm.c | 6 +++--- drivers/pinctrl/qcom/pinctrl-spmi-gpio.c | 8 ++++---- drivers/pinctrl/qcom/pinctrl-spmi-mpp.c | 8 ++++---- drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c | 4 ++-- drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c | 4 ++-- drivers/pinctrl/renesas/pinctrl-rza1.c | 2 +- drivers/pinctrl/stm32/pinctrl-stm32.c | 2 +- drivers/pinctrl/sunplus/sppctl.c | 4 ++-- include/linux/pinctrl/pinconf-generic.h | 12 ++++++++---- sound/hda/codecs/side-codecs/cirrus_scodec_test.c | 2 +- 39 files changed, 91 insertions(+), 87 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/pin-control.rst b/Documentation/driver-api/pin-control.rst index 27ea1236307e..8208924e513e 100644 --- a/Documentation/driver-api/pin-control.rst +++ b/Documentation/driver-api/pin-control.rst @@ -863,7 +863,7 @@ has to be handled by the ```` interface. Instead view this as a certain pin config setting. Look in e.g. ```` and you find this in the documentation: - PIN_CONFIG_OUTPUT: + PIN_CONFIG_LEVEL: this will configure the pin in output, use argument 1 to indicate high level, argument 0 to indicate low level.
@@ -897,7 +897,7 @@ And your machine configuration may look like this: }; static unsigned long uart_sleep_mode[] = { - PIN_CONF_PACKED(PIN_CONFIG_OUTPUT, 0), + PIN_CONF_PACKED(PIN_CONFIG_LEVEL, 0), }; static struct pinctrl_map pinmap[] __initdata = { diff --git a/drivers/gpio/gpio-rockchip.c b/drivers/gpio/gpio-rockchip.c index bcfc323a8315..47174eb3ba76 100644 --- a/drivers/gpio/gpio-rockchip.c +++ b/drivers/gpio/gpio-rockchip.c @@ -769,7 +769,7 @@ static int rockchip_gpio_probe(struct platform_device *pdev) list_del(&cfg->head); switch (cfg->param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = rockchip_gpio_direction_output(&bank->gpio_chip, cfg->pin, cfg->arg); if (ret) dev_warn(dev, "setting output pin %u to %u failed\n", cfg->pin, diff --git a/drivers/pinctrl/bcm/pinctrl-bcm2835.c b/drivers/pinctrl/bcm/pinctrl-bcm2835.c index 7dbf079739bc..c165674c5b4d 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm2835.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm2835.c @@ -1023,7 +1023,7 @@ static int bcm2835_pinconf_get(struct pinctrl_dev *pctldev, /* No way to read back bias config in HW */ switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (fsel != BCM2835_FSEL_GPIO_OUT) return -EINVAL; @@ -1091,7 +1091,7 @@ static int bcm2835_pinconf_set(struct pinctrl_dev *pctldev, break; /* Set output-high or output-low */ - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: bcm2835_gpio_set_bit(pc, arg ? GPSET0 : GPCLR0, pin); break; @@ -1202,7 +1202,7 @@ static int bcm2711_pinconf_set(struct pinctrl_dev *pctldev, break; /* Set output-high or output-low */ - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: bcm2835_gpio_set_bit(pc, arg ? GPSET0 : GPCLR0, pin); break; diff --git a/drivers/pinctrl/cirrus/pinctrl-madera-core.c b/drivers/pinctrl/cirrus/pinctrl-madera-core.c index d19ef13224cc..1d9481b17091 100644 --- a/drivers/pinctrl/cirrus/pinctrl-madera-core.c +++ b/drivers/pinctrl/cirrus/pinctrl-madera-core.c @@ -804,7 +804,7 @@ static int madera_pin_conf_get(struct pinctrl_dev *pctldev, unsigned int pin, if (conf[0] & MADERA_GP1_IP_CFG_MASK) result = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if ((conf[1] & MADERA_GP1_DIR_MASK) && (conf[0] & MADERA_GP1_LVL_MASK)) result = 1; @@ -902,7 +902,7 @@ static int madera_pin_conf_set(struct pinctrl_dev *pctldev, unsigned int pin, mask[1] |= MADERA_GP1_DIR_MASK; conf[1] |= MADERA_GP1_DIR; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: val = pinconf_to_config_argument(*configs); mask[0] |= MADERA_GP1_LVL_MASK; if (val) diff --git a/drivers/pinctrl/mediatek/pinctrl-airoha.c b/drivers/pinctrl/mediatek/pinctrl-airoha.c index 8fb3b65a1b77..c1e736786f5c 100644 --- a/drivers/pinctrl/mediatek/pinctrl-airoha.c +++ b/drivers/pinctrl/mediatek/pinctrl-airoha.c @@ -2765,7 +2765,7 @@ static int airoha_pinconf_set(struct pinctrl_dev *pctrl_dev, break; case PIN_CONFIG_OUTPUT_ENABLE: case PIN_CONFIG_INPUT_ENABLE: - case PIN_CONFIG_OUTPUT: { + case PIN_CONFIG_LEVEL: { bool input = param == PIN_CONFIG_INPUT_ENABLE; int err; @@ -2774,7 +2774,7 @@ static int airoha_pinconf_set(struct pinctrl_dev *pctrl_dev, if (err) return err; - if (param == PIN_CONFIG_OUTPUT) { + if (param == PIN_CONFIG_LEVEL) { err = airoha_pinconf_set_pin_value(pctrl_dev, pin, !!arg); if (err) diff --git a/drivers/pinctrl/mediatek/pinctrl-moore.c b/drivers/pinctrl/mediatek/pinctrl-moore.c index 11dc525eb3a2..70f608347a5f 100644 --- a/drivers/pinctrl/mediatek/pinctrl-moore.c +++ b/drivers/pinctrl/mediatek/pinctrl-moore.c @@ -332,7 +332,7 @@ static int 
mtk_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, goto err; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: err = mtk_hw_set_value(hw, desc, PINCTRL_PIN_REG_DIR, MTK_OUTPUT); if (err) diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c index d10306024111..d6a46fe0cda8 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c +++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c @@ -384,7 +384,7 @@ static int mtk_pconf_parse_conf(struct pinctrl_dev *pctldev, mtk_pmx_gpio_set_direction(pctldev, NULL, pin, true); ret = mtk_pconf_set_ies_smt(pctl, pin, arg, param); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: mtk_gpio_set(pctl->chip, pin, arg); ret = mtk_pmx_gpio_set_direction(pctldev, NULL, pin, false); break; diff --git a/drivers/pinctrl/mediatek/pinctrl-paris.c b/drivers/pinctrl/mediatek/pinctrl-paris.c index 3e714554789d..6bf37d8085fa 100644 --- a/drivers/pinctrl/mediatek/pinctrl-paris.c +++ b/drivers/pinctrl/mediatek/pinctrl-paris.c @@ -169,7 +169,7 @@ static int mtk_pinconf_get(struct pinctrl_dev *pctldev, if (!ret) err = -EINVAL; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: err = mtk_hw_get_value(hw, desc, PINCTRL_PIN_REG_DIR, &ret); if (err) break; @@ -292,7 +292,7 @@ static int mtk_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, /* regard all non-zero value as enable */ err = mtk_hw_set_value(hw, desc, PINCTRL_PIN_REG_SR, !!arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: err = mtk_hw_set_value(hw, desc, PINCTRL_PIN_REG_DO, arg); if (err) diff --git a/drivers/pinctrl/meson/pinctrl-amlogic-a4.c b/drivers/pinctrl/meson/pinctrl-amlogic-a4.c index e34e984c2b38..d2b15f133564 100644 --- a/drivers/pinctrl/meson/pinctrl-amlogic-a4.c +++ b/drivers/pinctrl/meson/pinctrl-amlogic-a4.c @@ -422,7 +422,7 @@ static int aml_pinconf_get(struct pinctrl_dev *pcdev, unsigned int pin, return -EINVAL; arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = aml_pinconf_get_output(info, pin); if (ret <= 0) return -EINVAL; @@ -568,7 +568,7 @@ static int aml_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin, switch (param) { case PIN_CONFIG_DRIVE_STRENGTH_UA: case PIN_CONFIG_OUTPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pinconf_to_config_argument(configs[i]); break; @@ -592,7 +592,7 @@ static int aml_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin, case PIN_CONFIG_OUTPUT_ENABLE: ret = aml_pinconf_set_output(info, pin, arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = aml_pinconf_set_output_drive(info, pin, arg); break; default: diff --git a/drivers/pinctrl/meson/pinctrl-meson.c b/drivers/pinctrl/meson/pinctrl-meson.c index 277e9c40490d..18295b15ecd9 100644 --- a/drivers/pinctrl/meson/pinctrl-meson.c +++ b/drivers/pinctrl/meson/pinctrl-meson.c @@ -360,7 +360,7 @@ static int meson_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin, switch (param) { case PIN_CONFIG_DRIVE_STRENGTH_UA: case PIN_CONFIG_OUTPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pinconf_to_config_argument(configs[i]); break; @@ -384,7 +384,7 @@ static int meson_pinconf_set(struct pinctrl_dev *pcdev, unsigned int pin, case PIN_CONFIG_OUTPUT_ENABLE: ret = meson_pinconf_set_output(pc, pin, arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = meson_pinconf_set_output_drive(pc, pin, arg); break; default: @@ -502,7 +502,7 @@ static int meson_pinconf_get(struct pinctrl_dev *pcdev, unsigned int pin, return 
-EINVAL; arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = meson_pinconf_get_output(pc, pin); if (ret <= 0) return -EINVAL; diff --git a/drivers/pinctrl/nomadik/pinctrl-abx500.c b/drivers/pinctrl/nomadik/pinctrl-abx500.c index 7b5f94d8cb23..fc7ebeda8440 100644 --- a/drivers/pinctrl/nomadik/pinctrl-abx500.c +++ b/drivers/pinctrl/nomadik/pinctrl-abx500.c @@ -860,8 +860,8 @@ static int abx500_pin_config_set(struct pinctrl_dev *pctldev, dev_dbg(chip->parent, "pin %d [%#lx]: %s %s\n", pin, configs[i], - (param == PIN_CONFIG_OUTPUT) ? "output " : "input", - (param == PIN_CONFIG_OUTPUT) ? + (param == PIN_CONFIG_LEVEL) ? "output " : "input", + (param == PIN_CONFIG_LEVEL) ? str_high_low(argument) : (argument ? "pull up" : "pull down")); @@ -907,7 +907,7 @@ static int abx500_pin_config_set(struct pinctrl_dev *pctldev, ret = abx500_gpio_direction_input(chip, offset); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = abx500_gpio_direction_output(chip, offset, argument); break; diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c index c2ca71ebb973..7f45c2897c3f 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c @@ -1700,13 +1700,13 @@ static int npcm7xx_config_get(struct pinctrl_dev *pctldev, unsigned int pin, else if (param == PIN_CONFIG_BIAS_PULL_DOWN) rc = (!pu && pd); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_INPUT_ENABLE: ie = ioread32(bank->base + NPCM7XX_GP_N_IEM) & pinmask; oe = ioread32(bank->base + NPCM7XX_GP_N_OE) & pinmask; if (param == PIN_CONFIG_INPUT_ENABLE) rc = (ie && !oe); - else if (param == PIN_CONFIG_OUTPUT) + else if (param == PIN_CONFIG_LEVEL) rc = (!ie && oe); break; case PIN_CONFIG_DRIVE_PUSH_PULL: @@ -1765,7 +1765,7 @@ static int npcm7xx_config_set_one(struct npcm7xx_pinctrl *npcm, iowrite32(gpio, bank->base + NPCM7XX_GP_N_OEC); bank->direction_input(&bank->chip.gc, pin % bank->chip.gc.ngpio); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: bank->direction_output(&bank->chip.gc, pin % bank->chip.gc.ngpio, arg); iowrite32(gpio, bank->base + NPCM7XX_GP_N_OES); break; diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c index 0f155a685bba..920dd2077925 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c @@ -2187,13 +2187,13 @@ static int npcm8xx_config_get(struct pinctrl_dev *pctldev, unsigned int pin, else if (param == PIN_CONFIG_BIAS_PULL_DOWN) rc = !pu && pd; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_INPUT_ENABLE: ie = ioread32(bank->base + NPCM8XX_GP_N_IEM) & pinmask; oe = ioread32(bank->base + NPCM8XX_GP_N_OE) & pinmask; if (param == PIN_CONFIG_INPUT_ENABLE) rc = (ie && !oe); - else if (param == PIN_CONFIG_OUTPUT) + else if (param == PIN_CONFIG_LEVEL) rc = (!ie && oe); break; case PIN_CONFIG_DRIVE_PUSH_PULL: @@ -2251,7 +2251,7 @@ static int npcm8xx_config_set_one(struct npcm8xx_pinctrl *npcm, iowrite32(gpio, bank->base + NPCM8XX_GP_N_OEC); bank->direction_input(&bank->chip.gc, pin % bank->chip.gc.ngpio); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: bank->direction_output(&bank->chip.gc, pin % bank->chip.gc.ngpio, arg); iowrite32(gpio, bank->base + NPCM8XX_GP_N_OES); break; diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index d67838afb085..5de6ff62c69b 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ 
b/drivers/pinctrl/pinconf-generic.c @@ -48,7 +48,7 @@ static const struct pin_config_item conf_items[] = { PCONFDUMP(PIN_CONFIG_INPUT_SCHMITT_ENABLE, "input schmitt enabled", NULL, false), PCONFDUMP(PIN_CONFIG_MODE_LOW_POWER, "pin low power", "mode", true), PCONFDUMP(PIN_CONFIG_OUTPUT_ENABLE, "output enabled", NULL, false), - PCONFDUMP(PIN_CONFIG_OUTPUT, "pin output", "level", true), + PCONFDUMP(PIN_CONFIG_LEVEL, "pin output", "level", true), PCONFDUMP(PIN_CONFIG_OUTPUT_IMPEDANCE_OHMS, "output impedance", "ohms", true), PCONFDUMP(PIN_CONFIG_POWER_SOURCE, "pin power source", "selector", true), PCONFDUMP(PIN_CONFIG_SLEEP_HARDWARE_STATE, "sleep hardware state", NULL, false), @@ -183,9 +183,9 @@ static const struct pinconf_generic_params dt_params[] = { { "low-power-enable", PIN_CONFIG_MODE_LOW_POWER, 1 }, { "output-disable", PIN_CONFIG_OUTPUT_ENABLE, 0 }, { "output-enable", PIN_CONFIG_OUTPUT_ENABLE, 1 }, - { "output-high", PIN_CONFIG_OUTPUT, 1, }, + { "output-high", PIN_CONFIG_LEVEL, 1, }, { "output-impedance-ohms", PIN_CONFIG_OUTPUT_IMPEDANCE_OHMS, 0 }, - { "output-low", PIN_CONFIG_OUTPUT, 0, }, + { "output-low", PIN_CONFIG_LEVEL, 0, }, { "power-source", PIN_CONFIG_POWER_SOURCE, 0 }, { "sleep-hardware-state", PIN_CONFIG_SLEEP_HARDWARE_STATE, 0 }, { "slew-rate", PIN_CONFIG_SLEW_RATE, 0 }, diff --git a/drivers/pinctrl/pinctrl-at91-pio4.c b/drivers/pinctrl/pinctrl-at91-pio4.c index 35ea3414cb96..ec5351fc282e 100644 --- a/drivers/pinctrl/pinctrl-at91-pio4.c +++ b/drivers/pinctrl/pinctrl-at91-pio4.c @@ -862,7 +862,7 @@ static int atmel_conf_pin_config_group_set(struct pinctrl_dev *pctldev, conf |= ATMEL_PIO_IFSCEN_MASK; } break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: conf |= ATMEL_PIO_DIR_MASK; bank = ATMEL_PIO_BANK(pin_id); pin = ATMEL_PIO_LINE(pin_id); diff --git a/drivers/pinctrl/pinctrl-aw9523.c b/drivers/pinctrl/pinctrl-aw9523.c index 890b83fddea3..479553a79216 100644 --- a/drivers/pinctrl/pinctrl-aw9523.c +++ b/drivers/pinctrl/pinctrl-aw9523.c @@ -215,7 +215,7 @@ static int aw9523_pcfg_param_to_reg(enum pin_config_param pcp, int pin, u8 *r) case PIN_CONFIG_OUTPUT_ENABLE: reg = AW9523_REG_CONF_STATE(pin); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: reg = AW9523_REG_OUT_STATE(pin); break; default: @@ -249,7 +249,7 @@ static int aw9523_pconf_get(struct pinctrl_dev *pctldev, unsigned int pin, switch (param) { case PIN_CONFIG_BIAS_PULL_UP: case PIN_CONFIG_INPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: val &= BIT(regbit); break; case PIN_CONFIG_BIAS_PULL_DOWN: @@ -301,7 +301,7 @@ static int aw9523_pconf_set(struct pinctrl_dev *pctldev, unsigned int pin, goto end; switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: /* First, enable pin output */ rc = regmap_update_bits(awi->regmap, AW9523_REG_CONF_STATE(pin), diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c index cf7f80497fde..a4b04bf6d081 100644 --- a/drivers/pinctrl/pinctrl-cy8c95x0.c +++ b/drivers/pinctrl/pinctrl-cy8c95x0.c @@ -808,7 +808,7 @@ static int cy8c95x0_gpio_get_pincfg(struct cy8c95x0_pinctrl *chip, case PIN_CONFIG_MODE_PWM: reg = CY8C95X0_SELPWM; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: reg = CY8C95X0_OUTPUT; break; case PIN_CONFIG_OUTPUT_ENABLE: diff --git a/drivers/pinctrl/pinctrl-ingenic.c b/drivers/pinctrl/pinctrl-ingenic.c index e5b24fab12e1..c7f14546de05 100644 --- a/drivers/pinctrl/pinctrl-ingenic.c +++ b/drivers/pinctrl/pinctrl-ingenic.c @@ -4264,7 +4264,7 @@ static int ingenic_pinconf_set(struct pinctrl_dev 
*pctldev, unsigned int pin, case PIN_CONFIG_BIAS_PULL_UP: case PIN_CONFIG_BIAS_PULL_DOWN: case PIN_CONFIG_INPUT_SCHMITT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_SLEW_RATE: continue; default: @@ -4305,7 +4305,7 @@ static int ingenic_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, ingenic_set_schmitt_trigger(jzpc, pin, arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = pinctrl_gpio_direction_output(jzpc->gc, pin - jzpc->gc->base); if (ret) diff --git a/drivers/pinctrl/pinctrl-k210.c b/drivers/pinctrl/pinctrl-k210.c index 66c04120c29d..ddd6d6bfd513 100644 --- a/drivers/pinctrl/pinctrl-k210.c +++ b/drivers/pinctrl/pinctrl-k210.c @@ -551,7 +551,7 @@ static int k210_pinconf_set_param(struct pinctrl_dev *pctldev, else val &= ~K210_PC_ST; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: k210_pinmux_set_pin_function(pctldev, pin, K210_PCF_CONSTANT); val = readl(&pdata->fpioa->pins[pin]); val |= K210_PC_MODE_OUT; diff --git a/drivers/pinctrl/pinctrl-microchip-sgpio.c b/drivers/pinctrl/pinctrl-microchip-sgpio.c index 069cc665a402..b6363f3cdce9 100644 --- a/drivers/pinctrl/pinctrl-microchip-sgpio.c +++ b/drivers/pinctrl/pinctrl-microchip-sgpio.c @@ -371,7 +371,7 @@ static int sgpio_pinconf_get(struct pinctrl_dev *pctldev, val = !bank->is_input; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (bank->is_input) return -EINVAL; val = sgpio_output_get(priv, &addr); @@ -402,7 +402,7 @@ static int sgpio_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, arg = pinconf_to_config_argument(configs[cfg]); switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (bank->is_input) return -EINVAL; err = sgpio_output_set(priv, &addr, arg); diff --git a/drivers/pinctrl/pinctrl-ocelot.c b/drivers/pinctrl/pinctrl-ocelot.c index b82bf83fed25..70da3f37567a 100644 --- a/drivers/pinctrl/pinctrl-ocelot.c +++ b/drivers/pinctrl/pinctrl-ocelot.c @@ -1656,7 +1656,7 @@ static int ocelot_pinconf_get(struct pinctrl_dev *pctldev, return err; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: err = regmap_read(info->map, REG(OCELOT_GPIO_OUT, info, pin), &val); if (err) @@ -1735,7 +1735,7 @@ static int ocelot_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, case PIN_CONFIG_OUTPUT_ENABLE: case PIN_CONFIG_INPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: p = pin % 32; if (arg) regmap_write(info->map, diff --git a/drivers/pinctrl/pinctrl-pic32.c b/drivers/pinctrl/pinctrl-pic32.c index 37c2bf752154..e8b481e87c77 100644 --- a/drivers/pinctrl/pinctrl-pic32.c +++ b/drivers/pinctrl/pinctrl-pic32.c @@ -1905,7 +1905,7 @@ static int pic32_pinconf_get(struct pinctrl_dev *pctldev, unsigned pin, case PIN_CONFIG_INPUT_ENABLE: arg = !!(readl(bank->reg_base + TRIS_REG) & mask); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = !(readl(bank->reg_base + TRIS_REG) & mask); break; default: @@ -1960,7 +1960,7 @@ static int pic32_pinconf_set(struct pinctrl_dev *pctldev, unsigned pin, case PIN_CONFIG_INPUT_ENABLE: pic32_gpio_direction_input(&bank->gpio_chip, offset); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pic32_gpio_direction_output(&bank->gpio_chip, offset, arg); break; diff --git a/drivers/pinctrl/pinctrl-rk805.c b/drivers/pinctrl/pinctrl-rk805.c index 3acf770316c1..22f576337faa 100644 --- a/drivers/pinctrl/pinctrl-rk805.c +++ b/drivers/pinctrl/pinctrl-rk805.c @@ -541,7 +541,7 @@ static int rk805_pinconf_get(struct pinctrl_dev *pctldev, u32 arg = 0; switch (param) { - case 
PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_INPUT_ENABLE: arg = rk805_gpio_get(&pci->gpio_chip, pin); break; @@ -568,7 +568,7 @@ static int rk805_pinconf_set(struct pinctrl_dev *pctldev, arg = pinconf_to_config_argument(configs[i]); switch (param) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: rk805_gpio_set(&pci->gpio_chip, pin, arg); rk805_pmx_gpio_set_direction(pctldev, NULL, pin, false); break; diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index 930c454e0cec..7a68a6237649 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -3272,7 +3272,7 @@ static int rockchip_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, param = pinconf_to_config_param(configs[i]); arg = pinconf_to_config_argument(configs[i]); - if (param == PIN_CONFIG_OUTPUT || param == PIN_CONFIG_INPUT_ENABLE) { + if (param == PIN_CONFIG_LEVEL || param == PIN_CONFIG_INPUT_ENABLE) { /* * Check for gpio driver not being probed yet. * The lock makes sure that either gpio-probe has completed @@ -3313,7 +3313,7 @@ static int rockchip_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, if (rc) return rc; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: rc = rockchip_set_mux(bank, pin - bank->pin_base, RK_FUNC_GPIO); if (rc != RK_FUNC_GPIO) @@ -3392,7 +3392,7 @@ static int rockchip_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: rc = rockchip_get_mux(bank, pin - bank->pin_base); if (rc != RK_FUNC_GPIO) return -EINVAL; diff --git a/drivers/pinctrl/pinctrl-rp1.c b/drivers/pinctrl/pinctrl-rp1.c index 605105aa0cbc..ffc2f0b460a6 100644 --- a/drivers/pinctrl/pinctrl-rp1.c +++ b/drivers/pinctrl/pinctrl-rp1.c @@ -1440,7 +1440,7 @@ static int rp1_pinconf_set(struct pinctrl_dev *pctldev, unsigned int offset, rp1_output_enable(pin, arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: rp1_set_value(pin, arg); rp1_set_dir(pin, RP1_DIR_OUTPUT); rp1_set_fsel(pin, RP1_FSEL_GPIO); diff --git a/drivers/pinctrl/pinctrl-scmi.c b/drivers/pinctrl/pinctrl-scmi.c index 383681041e4c..d14528b9aa31 100644 --- a/drivers/pinctrl/pinctrl-scmi.c +++ b/drivers/pinctrl/pinctrl-scmi.c @@ -253,7 +253,7 @@ static int pinctrl_scmi_map_pinconf_type(enum pin_config_param param, case PIN_CONFIG_MODE_LOW_POWER: *type = SCMI_PIN_LOW_POWER_MODE; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: *type = SCMI_PIN_OUTPUT_VALUE; break; case PIN_CONFIG_OUTPUT_ENABLE: diff --git a/drivers/pinctrl/pinctrl-stmfx.c b/drivers/pinctrl/pinctrl-stmfx.c index c89b99003b71..03ee13844b50 100644 --- a/drivers/pinctrl/pinctrl-stmfx.c +++ b/drivers/pinctrl/pinctrl-stmfx.c @@ -267,7 +267,7 @@ static int stmfx_pinconf_get(struct pinctrl_dev *pctldev, if ((!dir && !type) || (dir && type)) arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (dir) return -EINVAL; @@ -334,7 +334,7 @@ static int stmfx_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, if (ret) return ret; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = stmfx_gpio_direction_output(&pctl->gpio_chip, pin, arg); if (ret) diff --git a/drivers/pinctrl/pinctrl-sx150x.c b/drivers/pinctrl/pinctrl-sx150x.c index b613568b42b7..1d6760ffe809 100644 --- a/drivers/pinctrl/pinctrl-sx150x.c +++ b/drivers/pinctrl/pinctrl-sx150x.c @@ -611,7 +611,7 @@ static int sx150x_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, if (sx150x_pin_is_oscio(pctl, pin)) { switch (param) { case 
PIN_CONFIG_DRIVE_PUSH_PULL: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = regmap_read(pctl->regmap, pctl->data->pri.x789.reg_clock, &data); @@ -705,7 +705,7 @@ static int sx150x_pinconf_get(struct pinctrl_dev *pctldev, unsigned int pin, } break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = sx150x_gpio_get_direction(&pctl->gpio, pin); if (ret < 0) return ret; @@ -744,7 +744,7 @@ static int sx150x_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, arg = pinconf_to_config_argument(configs[i]); if (sx150x_pin_is_oscio(pctl, pin)) { - if (param == PIN_CONFIG_OUTPUT) { + if (param == PIN_CONFIG_LEVEL) { ret = sx150x_gpio_direction_output(&pctl->gpio, pin, arg); if (ret < 0) @@ -816,7 +816,7 @@ static int sx150x_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: ret = sx150x_gpio_direction_output(&pctl->gpio, pin, arg); if (ret < 0) diff --git a/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c b/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c index 54c77e0b96e9..28eb92a2b249 100644 --- a/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c +++ b/drivers/pinctrl/qcom/pinctrl-lpass-lpi.c @@ -174,7 +174,7 @@ static int lpi_config_get(struct pinctrl_dev *pctldev, arg = 1; break; case PIN_CONFIG_INPUT_ENABLE: - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (is_out) arg = 1; break; @@ -252,7 +252,7 @@ static int lpi_config_set(struct pinctrl_dev *pctldev, unsigned int group, case PIN_CONFIG_INPUT_ENABLE: output_enabled = false; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: output_enabled = true; value = arg; break; @@ -314,7 +314,7 @@ static int lpi_gpio_direction_output(struct gpio_chip *chip, struct lpi_pinctrl *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, val); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, val); return lpi_config_set(state->ctrl, pin, &config, 1); } @@ -332,7 +332,7 @@ static int lpi_gpio_set(struct gpio_chip *chip, unsigned int pin, int value) struct lpi_pinctrl *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, value); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, value); return lpi_config_set(state->ctrl, pin, &config, 1); } diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c index 1751d838ce95..67525d542c5b 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm.c +++ b/drivers/pinctrl/qcom/pinctrl-msm.c @@ -295,7 +295,7 @@ static int msm_config_reg(struct msm_pinctrl *pctrl, *bit = g->drv_bit; *mask = 7; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_INPUT_ENABLE: case PIN_CONFIG_OUTPUT_ENABLE: *bit = g->oe_bit; @@ -385,7 +385,7 @@ static int msm_config_group_get(struct pinctrl_dev *pctldev, case PIN_CONFIG_DRIVE_STRENGTH: arg = msm_regval_to_drive(arg); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: /* Pin is not output */ if (!arg) return -EINVAL; @@ -464,7 +464,7 @@ static int msm_config_group_set(struct pinctrl_dev *pctldev, else arg = (arg / 2) - 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: /* set output value */ raw_spin_lock_irqsave(&pctrl->lock, flags); val = msm_readl_io(pctrl, g); diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c index b7b15874e488..485b68cc93f8 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c @@ -438,7 +438,7 @@ static int pmic_gpio_config_get(struct 
pinctrl_dev *pctldev, case PIN_CONFIG_OUTPUT_ENABLE: arg = pad->output_enabled; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pad->out_value; break; case PMIC_GPIO_CONF_PULL_UP: @@ -530,7 +530,7 @@ static int pmic_gpio_config_set(struct pinctrl_dev *pctldev, unsigned int pin, case PIN_CONFIG_OUTPUT_ENABLE: pad->output_enabled = arg ? true : false; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pad->output_enabled = true; pad->out_value = arg; break; @@ -737,7 +737,7 @@ static int pmic_gpio_direction_output(struct gpio_chip *chip, struct pmic_gpio_state *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, val); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, val); return pmic_gpio_config_set(state->ctrl, pin, &config, 1); } @@ -769,7 +769,7 @@ static int pmic_gpio_set(struct gpio_chip *chip, unsigned int pin, int value) struct pmic_gpio_state *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, value); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, value); return pmic_gpio_config_set(state->ctrl, pin, &config, 1); } diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c b/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c index 22d76b1013a3..64f8024a865c 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-mpp.c @@ -370,7 +370,7 @@ static int pmic_mpp_config_get(struct pinctrl_dev *pctldev, return -EINVAL; arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pad->out_value; break; case PMIC_MPP_CONF_DTEST_SELECTOR: @@ -447,7 +447,7 @@ static int pmic_mpp_config_set(struct pinctrl_dev *pctldev, unsigned int pin, case PIN_CONFIG_INPUT_ENABLE: pad->input_enabled = arg ? 
true : false; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pad->output_enabled = true; pad->out_value = arg; break; @@ -576,7 +576,7 @@ static int pmic_mpp_direction_output(struct gpio_chip *chip, struct pmic_mpp_state *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, val); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, val); return pmic_mpp_config_set(state->ctrl, pin, &config, 1); } @@ -605,7 +605,7 @@ static int pmic_mpp_set(struct gpio_chip *chip, unsigned int pin, int value) struct pmic_mpp_state *state = gpiochip_get_data(chip); unsigned long config; - config = pinconf_to_config_packed(PIN_CONFIG_OUTPUT, value); + config = pinconf_to_config_packed(PIN_CONFIG_LEVEL, value); return pmic_mpp_config_set(state->ctrl, pin, &config, 1); } diff --git a/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c b/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c index fb37b1c1acb4..5c966d51eda7 100644 --- a/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-ssbi-gpio.c @@ -282,7 +282,7 @@ static int pm8xxx_pin_config_get(struct pinctrl_dev *pctldev, return -EINVAL; arg = 1; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (pin->mode & PM8XXX_GPIO_MODE_OUTPUT) arg = pin->output_value; else @@ -364,7 +364,7 @@ static int pm8xxx_pin_config_set(struct pinctrl_dev *pctldev, pin->mode = PM8XXX_GPIO_MODE_INPUT; banks |= BIT(0) | BIT(1); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pin->mode = PM8XXX_GPIO_MODE_OUTPUT; pin->output_value = !!arg; banks |= BIT(0) | BIT(1); diff --git a/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c b/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c index 6103849af042..7970fa6e1557 100644 --- a/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c +++ b/drivers/pinctrl/qcom/pinctrl-ssbi-mpp.c @@ -337,7 +337,7 @@ static int pm8xxx_pin_config_get(struct pinctrl_dev *pctldev, case PIN_CONFIG_INPUT_ENABLE: arg = pin->input; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: arg = pin->output_value; break; case PIN_CONFIG_POWER_SOURCE: @@ -392,7 +392,7 @@ static int pm8xxx_pin_config_set(struct pinctrl_dev *pctldev, case PIN_CONFIG_INPUT_ENABLE: pin->input = true; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: pin->output = true; pin->output_value = !!arg; break; diff --git a/drivers/pinctrl/renesas/pinctrl-rza1.c b/drivers/pinctrl/renesas/pinctrl-rza1.c index 70f22e0ef307..f24e5915cbe4 100644 --- a/drivers/pinctrl/renesas/pinctrl-rza1.c +++ b/drivers/pinctrl/renesas/pinctrl-rza1.c @@ -933,7 +933,7 @@ static int rza1_parse_pinmux_node(struct rza1_pinctrl *rza1_pctl, case PIN_CONFIG_INPUT_ENABLE: pinmux_flags |= MUX_FLAGS_SWIO_INPUT; break; - case PIN_CONFIG_OUTPUT: /* for DT backwards compatibility */ + case PIN_CONFIG_LEVEL: /* for DT backwards compatibility */ case PIN_CONFIG_OUTPUT_ENABLE: pinmux_flags |= MUX_FLAGS_SWIO_OUTPUT; break; diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c index 823c8fe758e2..3ebb468de830 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32.c @@ -1236,7 +1236,7 @@ static int stm32_pconf_parse_conf(struct pinctrl_dev *pctldev, case PIN_CONFIG_BIAS_PULL_DOWN: ret = stm32_pconf_set_bias(bank, offset, 2); break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: __stm32_gpio_set(bank, offset, arg); ret = stm32_pmx_gpio_set_direction(pctldev, range, pin, false); break; diff --git a/drivers/pinctrl/sunplus/sppctl.c b/drivers/pinctrl/sunplus/sppctl.c index 3e924aa86cc2..fabe7efaa837 
100644 --- a/drivers/pinctrl/sunplus/sppctl.c +++ b/drivers/pinctrl/sunplus/sppctl.c @@ -488,7 +488,7 @@ static int sppctl_gpio_set_config(struct gpio_chip *chip, unsigned int offset, case PIN_CONFIG_INPUT_ENABLE: break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: return sppctl_gpio_direction_output(chip, offset, 0); case PIN_CONFIG_PERSIST_STATE: @@ -580,7 +580,7 @@ static int sppctl_pin_config_get(struct pinctrl_dev *pctldev, unsigned int pin, arg = 0; break; - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: if (!sppctl_first_get(&pctl->spp_gchip->chip, pin)) return -EINVAL; if (!sppctl_master_get(&pctl->spp_gchip->chip, pin)) diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 1bcf071b860e..d9245ecec71d 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -88,9 +88,13 @@ struct pinctrl_map; * passed in the argument on a custom form, else just use argument 1 * to indicate low power mode, argument 0 turns low power mode off. * @PIN_CONFIG_MODE_PWM: this will configure the pin for PWM - * @PIN_CONFIG_OUTPUT: this will configure the pin as an output and drive a - * value on the line. Use argument 1 to indicate high level, argument 0 to - * indicate low level. (Please see Documentation/driver-api/pin-control.rst, + * @PIN_CONFIG_LEVEL: setting this will configure the pin as an output and + * drive a value on the line. Use argument 1 to indicate high level, + * argument 0 to indicate low level. Conversely the value of the line + * can be read using this parameter, if and only if that value can be + * represented as a binary 0 or 1 where 0 indicate a low voltage level + * and 1 indicate a high voltage level. + * (Please see Documentation/driver-api/pin-control.rst, * section "GPIO mode pitfalls" for a discussion around this parameter.) * @PIN_CONFIG_OUTPUT_ENABLE: this will enable the pin's output mode * without driving a value there. For most platforms this reduces to @@ -137,7 +141,7 @@ enum pin_config_param { PIN_CONFIG_INPUT_SCHMITT_UV, PIN_CONFIG_MODE_LOW_POWER, PIN_CONFIG_MODE_PWM, - PIN_CONFIG_OUTPUT, + PIN_CONFIG_LEVEL, PIN_CONFIG_OUTPUT_ENABLE, PIN_CONFIG_OUTPUT_IMPEDANCE_OHMS, PIN_CONFIG_PERSIST_STATE, diff --git a/sound/hda/codecs/side-codecs/cirrus_scodec_test.c b/sound/hda/codecs/side-codecs/cirrus_scodec_test.c index 9ba14c09c07f..3cca750857b6 100644 --- a/sound/hda/codecs/side-codecs/cirrus_scodec_test.c +++ b/sound/hda/codecs/side-codecs/cirrus_scodec_test.c @@ -69,7 +69,7 @@ static int cirrus_scodec_test_gpio_set_config(struct gpio_chip *gc, unsigned long config) { switch (pinconf_to_config_param(config)) { - case PIN_CONFIG_OUTPUT: + case PIN_CONFIG_LEVEL: case PIN_CONFIG_OUTPUT_ENABLE: return -EOPNOTSUPP; default: -- cgit v1.2.3 From 5f3cec21f6d57336a2cbfa9ee428ac66c77a7211 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 29 Aug 2025 20:46:01 +0300 Subject: overflow: add range_overflows() and range_end_overflows() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the range_overflows() and range_end_overflows() along with the _t variants over from drm/i915 and drm/buddy to overflow.h. Cc: Kees Cook Cc: "Gustavo A. R. 
Silva" Cc: linux-hardening@vger.kernel.org Reviewed-by: Kees Cook Reviewed-by: Jouni Högander Acked-by: Thomas Zimmermann Link: https://lore.kernel.org/r/20250829174601.2163064-3-jani.nikula@intel.com Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_utils.h | 70 --------------------------------------- include/drm/drm_buddy.h | 9 ----- include/linux/overflow.h | 70 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 79 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h index 968dae941532..eb4d43c40009 100644 --- a/drivers/gpu/drm/i915/i915_utils.h +++ b/drivers/gpu/drm/i915/i915_utils.h @@ -67,76 +67,6 @@ bool i915_error_injected(void); drm_err(&(i915)->drm, fmt, ##__VA_ARGS__); \ }) -/** - * range_overflows() - Check if a range is out of bounds - * @start: Start of the range. - * @size: Size of the range. - * @max: Exclusive upper boundary. - * - * A strict check to determine if the range [@start, @start + @size) is - * invalid with respect to the allowable range [0, @max). Any range - * starting at or beyond @max is considered an overflow, even if @size is 0. - * - * Returns: true if the range is out of bounds. - */ -#define range_overflows(start, size, max) ({ \ - typeof(start) start__ = (start); \ - typeof(size) size__ = (size); \ - typeof(max) max__ = (max); \ - (void)(&start__ == &size__); \ - (void)(&start__ == &max__); \ - start__ >= max__ || size__ > max__ - start__; \ -}) - -/** - * range_overflows_t() - Check if a range is out of bounds - * @type: Data type to use. - * @start: Start of the range. - * @size: Size of the range. - * @max: Exclusive upper boundary. - * - * Same as range_overflows() but forcing the parameters to @type. - * - * Returns: true if the range is out of bounds. - */ -#define range_overflows_t(type, start, size, max) \ - range_overflows((type)(start), (type)(size), (type)(max)) - -/** - * range_end_overflows() - Check if a range's endpoint is out of bounds - * @start: Start of the range. - * @size: Size of the range. - * @max: Exclusive upper boundary. - * - * Checks only if the endpoint of a range (@start + @size) exceeds @max. - * Unlike range_overflows(), a zero-sized range at the boundary (@start == @max) - * is not considered an overflow. Useful for iterator-style checks. - * - * Returns: true if the endpoint exceeds the boundary. - */ -#define range_end_overflows(start, size, max) ({ \ - typeof(start) start__ = (start); \ - typeof(size) size__ = (size); \ - typeof(max) max__ = (max); \ - (void)(&start__ == &size__); \ - (void)(&start__ == &max__); \ - start__ > max__ || size__ > max__ - start__; \ -}) - -/** - * range_end_overflows_t() - Check if a range's endpoint is out of bounds - * @type: Data type to use. - * @start: Start of the range. - * @size: Size of the range. - * @max: Exclusive upper boundary. - * - * Same as range_end_overflows() but forcing the parameters to @type. - * - * Returns: true if the endpoint exceeds the boundary. 
- */ -#define range_end_overflows_t(type, start, size, max) \ - range_end_overflows((type)(start), (type)(size), (type)(max)) - #define ptr_mask_bits(ptr, n) ({ \ unsigned long __v = (unsigned long)(ptr); \ (typeof(ptr))(__v & -BIT(n)); \ diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h index 9689a7c5dd36..236971681514 100644 --- a/include/drm/drm_buddy.h +++ b/include/drm/drm_buddy.h @@ -13,15 +13,6 @@ #include -#define range_overflows(start, size, max) ({ \ - typeof(start) start__ = (start); \ - typeof(size) size__ = (size); \ - typeof(max) max__ = (max); \ - (void)(&start__ == &size__); \ - (void)(&start__ == &max__); \ - start__ >= max__ || size__ > max__ - start__; \ -}) - #define DRM_BUDDY_RANGE_ALLOCATION BIT(0) #define DRM_BUDDY_TOPDOWN_ALLOCATION BIT(1) #define DRM_BUDDY_CONTIGUOUS_ALLOCATION BIT(2) diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 154ed0dbb43f..725f95f7e416 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -238,6 +238,76 @@ static inline bool __must_check __must_check_overflow(bool overflow) __overflows_type_constexpr(n, T), \ __overflows_type(n, T)) +/** + * range_overflows() - Check if a range is out of bounds + * @start: Start of the range. + * @size: Size of the range. + * @max: Exclusive upper boundary. + * + * A strict check to determine if the range [@start, @start + @size) is + * invalid with respect to the allowable range [0, @max). Any range + * starting at or beyond @max is considered an overflow, even if @size is 0. + * + * Returns: true if the range is out of bounds. + */ +#define range_overflows(start, size, max) ({ \ + typeof(start) start__ = (start); \ + typeof(size) size__ = (size); \ + typeof(max) max__ = (max); \ + (void)(&start__ == &size__); \ + (void)(&start__ == &max__); \ + start__ >= max__ || size__ > max__ - start__; \ +}) + +/** + * range_overflows_t() - Check if a range is out of bounds + * @type: Data type to use. + * @start: Start of the range. + * @size: Size of the range. + * @max: Exclusive upper boundary. + * + * Same as range_overflows() but forcing the parameters to @type. + * + * Returns: true if the range is out of bounds. + */ +#define range_overflows_t(type, start, size, max) \ + range_overflows((type)(start), (type)(size), (type)(max)) + +/** + * range_end_overflows() - Check if a range's endpoint is out of bounds + * @start: Start of the range. + * @size: Size of the range. + * @max: Exclusive upper boundary. + * + * Checks only if the endpoint of a range (@start + @size) exceeds @max. + * Unlike range_overflows(), a zero-sized range at the boundary (@start == @max) + * is not considered an overflow. Useful for iterator-style checks. + * + * Returns: true if the endpoint exceeds the boundary. + */ +#define range_end_overflows(start, size, max) ({ \ + typeof(start) start__ = (start); \ + typeof(size) size__ = (size); \ + typeof(max) max__ = (max); \ + (void)(&start__ == &size__); \ + (void)(&start__ == &max__); \ + start__ > max__ || size__ > max__ - start__; \ +}) + +/** + * range_end_overflows_t() - Check if a range's endpoint is out of bounds + * @type: Data type to use. + * @start: Start of the range. + * @size: Size of the range. + * @max: Exclusive upper boundary. + * + * Same as range_end_overflows() but forcing the parameters to @type. + * + * Returns: true if the endpoint exceeds the boundary. 
+ */ +#define range_end_overflows_t(type, start, size, max) \ + range_end_overflows((type)(start), (type)(size), (type)(max)) + /** * castable_to_type - like __same_type(), but also allows for casted literals * -- cgit v1.2.3 From 3ad2a7b9b15d5072139a20be84adb36776eb6c9b Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 5 Jun 2025 16:23:57 -0700 Subject: hwmon: Serialize accesses in hwmon core Implement locking in the hardware monitoring core for drivers using the _with_info() API functions. Most hardware monitoring drivers need to support locking to protect against parallel accesses from userspace. With older API functions, such locking had to be implemented in the driver code since sysfs attributes were created by the driver. However, the _with_info() API creates sysfs attributes in the hardware monitoring core. This makes it easy to move the locking primitives into that code. This has the benefit of simplifying driver code while at the same time reducing the risk of incomplete or bad locking implementations in hardware monitoring drivers. While this means that all accesses are forced to be synchronized, this has little if any practical impact since accesses are expected to be low frequency and are typically synchronized from userspace anyway since only a single process is accessing the data. On top of that, many drivers use regmap, which also has its own locking scheme and already serializes accesses. Signed-off-by: Guenter Roeck --- Documentation/hwmon/hwmon-kernel-api.rst | 10 ++++++++ drivers/hwmon/hwmon.c | 42 ++++++++++++++++++++++++++------ include/linux/hwmon.h | 3 +++ 3 files changed, 48 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/Documentation/hwmon/hwmon-kernel-api.rst b/Documentation/hwmon/hwmon-kernel-api.rst index 037b69c23cb5..1d7f1397a827 100644 --- a/Documentation/hwmon/hwmon-kernel-api.rst +++ b/Documentation/hwmon/hwmon-kernel-api.rst @@ -42,6 +42,9 @@ register/unregister functions:: char *devm_hwmon_sanitize_name(struct device *dev, const char *name); + void hwmon_lock(struct device *dev); + void hwmon_unlock(struct device *dev); + hwmon_device_register_with_info registers a hardware monitoring device. It creates the standard sysfs attributes in the hardware monitoring core, letting the driver focus on reading from and writing to the chip instead @@ -79,6 +82,13 @@ devm_hwmon_sanitize_name is the resource managed version of hwmon_sanitize_name; the memory will be freed automatically on device removal. +When using ``[devm_]hwmon_device_register_with_info()`` to register the +hardware monitoring device, accesses using the associated access functions +are serialized by the hardware monitoring core. If a driver needs locking +for other functions such as interrupt handlers or for attributes which are +fully implemented in the driver, hwmon_lock() and hwmon_unlock() can be used +to ensure that calls to those functions are serialized.
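For illustration, a minimal sketch of the driver-side usage (assumed driver context, not code from this patch; the foo_* names are hypothetical). Note that hwmon_lock() takes a mutex, so the handler must run in process context, e.g. as a threaded interrupt handler:

    #include <linux/hwmon.h>
    #include <linux/interrupt.h>

    struct foo_data {
            /* device returned by devm_hwmon_device_register_with_info() */
            struct device *hwmon_dev;
    };

    static void foo_handle_alarms(struct foo_data *foo)
    {
            /* read/clear chip status registers shared with the
             * _with_info() read/write callbacks */
    }

    static irqreturn_t foo_irq_thread(int irq, void *dev_id)
    {
            struct foo_data *foo = dev_id;

            /* serialize against the access functions run by the hwmon core */
            hwmon_lock(foo->hwmon_dev);
            foo_handle_alarms(foo);
            hwmon_unlock(foo->hwmon_dev);

            return IRQ_HANDLED;
    }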
+ Using devm_hwmon_device_register_with_info() -------------------------------------------- diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c index 2e17f3a4c59b..0b4bdcd33c7b 100644 --- a/drivers/hwmon/hwmon.c +++ b/drivers/hwmon/hwmon.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,7 @@ struct hwmon_device { const char *label; struct device dev; const struct hwmon_chip_info *chip; + struct mutex lock; struct list_head tzdata; struct attribute_group group; const struct attribute_group **groups; @@ -165,6 +167,8 @@ static int hwmon_thermal_get_temp(struct thermal_zone_device *tz, int *temp) int ret; long t; + guard(mutex)(&hwdev->lock); + ret = hwdev->chip->ops->read(tdata->dev, hwmon_temp, hwmon_temp_input, tdata->index, &t); if (ret < 0) @@ -193,6 +197,8 @@ static int hwmon_thermal_set_trips(struct thermal_zone_device *tz, int low, int if (!info[i]) return 0; + guard(mutex)(&hwdev->lock); + if (info[i]->config[tdata->index] & HWMON_T_MIN) { err = chip->ops->write(tdata->dev, hwmon_temp, hwmon_temp_min, tdata->index, low); @@ -330,8 +336,6 @@ static int hwmon_attr_base(enum hwmon_sensor_types type) * attached to an i2c client device. */ -static DEFINE_MUTEX(hwmon_pec_mutex); - static int hwmon_match_device(struct device *dev, const void *data) { return dev->class == &hwmon_class; @@ -362,17 +366,16 @@ static ssize_t pec_store(struct device *dev, struct device_attribute *devattr, if (!hdev) return -ENODEV; - mutex_lock(&hwmon_pec_mutex); - /* * If there is no write function, we assume that chip specific * handling is not required. */ hwdev = to_hwmon_device(hdev); + guard(mutex)(&hwdev->lock); if (hwdev->chip->ops->write) { err = hwdev->chip->ops->write(hdev, hwmon_chip, hwmon_chip_pec, 0, val); if (err && err != -EOPNOTSUPP) - goto unlock; + goto put; } if (!val) @@ -381,8 +384,7 @@ static ssize_t pec_store(struct device *dev, struct device_attribute *devattr, client->flags |= I2C_CLIENT_PEC; err = count; -unlock: - mutex_unlock(&hwmon_pec_mutex); +put: put_device(hdev); return err; @@ -426,10 +428,13 @@ static ssize_t hwmon_attr_show(struct device *dev, struct device_attribute *devattr, char *buf) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); + struct hwmon_device *hwdev = to_hwmon_device(dev); s64 val64; long val; int ret; + guard(mutex)(&hwdev->lock); + ret = hattr->ops->read(dev, hattr->type, hattr->attr, hattr->index, (hattr->type == hwmon_energy64) ? 
(long *)&val64 : &val); if (ret < 0) @@ -449,10 +454,13 @@ static ssize_t hwmon_attr_show_string(struct device *dev, char *buf) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); + struct hwmon_device *hwdev = to_hwmon_device(dev); enum hwmon_sensor_types type = hattr->type; const char *s; int ret; + guard(mutex)(&hwdev->lock); + ret = hattr->ops->read_string(dev, hattr->type, hattr->attr, hattr->index, &s); if (ret < 0) @@ -469,6 +477,7 @@ static ssize_t hwmon_attr_store(struct device *dev, const char *buf, size_t count) { struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr); + struct hwmon_device *hwdev = to_hwmon_device(dev); long val; int ret; @@ -476,6 +485,8 @@ static ssize_t hwmon_attr_store(struct device *dev, if (ret < 0) return ret; + guard(mutex)(&hwdev->lock); + ret = hattr->ops->write(dev, hattr->type, hattr->attr, hattr->index, val); if (ret < 0) @@ -791,6 +802,22 @@ int hwmon_notify_event(struct device *dev, enum hwmon_sensor_types type, } EXPORT_SYMBOL_GPL(hwmon_notify_event); +void hwmon_lock(struct device *dev) +{ + struct hwmon_device *hwdev = to_hwmon_device(dev); + + mutex_lock(&hwdev->lock); +} +EXPORT_SYMBOL_GPL(hwmon_lock); + +void hwmon_unlock(struct device *dev) +{ + struct hwmon_device *hwdev = to_hwmon_device(dev); + + mutex_unlock(&hwdev->lock); +} +EXPORT_SYMBOL_GPL(hwmon_unlock); + static int hwmon_num_channel_attrs(const struct hwmon_channel_info *info) { int i, n; @@ -951,6 +978,7 @@ __hwmon_device_register(struct device *dev, const char *name, void *drvdata, tdev = tdev->parent; hdev->of_node = tdev ? tdev->of_node : NULL; hwdev->chip = chip; + mutex_init(&hwdev->lock); dev_set_drvdata(hdev, drvdata); dev_set_name(hdev, HWMON_ID_FORMAT, id); err = device_register(hdev); diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 886fc90b2d25..301a83afbd66 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -492,6 +492,9 @@ int hwmon_notify_event(struct device *dev, enum hwmon_sensor_types type, char *hwmon_sanitize_name(const char *name); char *devm_hwmon_sanitize_name(struct device *dev, const char *name); +void hwmon_lock(struct device *dev); +void hwmon_unlock(struct device *dev); + /** * hwmon_is_bad_char - Is the char invalid in a hwmon name * @ch: the char to be considered -- cgit v1.2.3 From ca734f54b346db170be20ff3382bc3da8eb65904 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 17 Aug 2025 14:53:15 -0700 Subject: Input: pxa27x-keypad - drop support for platform data There are no users of pxa27x_keypad_platform_data in the kernel, and the driver supports configuration via device tree, so drop support for static platform data and move properties parsing from OF-specific methods to generic ones.
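The conversion pattern, as a minimal sketch (the foo_parse_keys() wrapper is hypothetical, not code from this patch): the device_property_*() helpers from <linux/property.h> take a struct device rather than a struct device_node, so the same parsing code works whether the properties come from devicetree, ACPI or software nodes:

    #include <linux/property.h>

    /* before, OF-specific and devicetree-only:
     *   of_property_read_u32(dev->of_node, "marvell,direct-key-count", &count);
     */
    static int foo_parse_keys(struct device *dev, u32 *count)
    {
            /* after: firmware-interface agnostic */
            return device_property_read_u32(dev, "marvell,direct-key-count",
                                            count);
    }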
Link: https://lore.kernel.org/r/20250817215316.1872689-3-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/pxa27x_keypad.c | 378 +++++++++------------------- include/linux/platform_data/keypad-pxa27x.h | 73 ------ 2 files changed, 121 insertions(+), 330 deletions(-) delete mode 100644 include/linux/platform_data/keypad-pxa27x.h (limited to 'include/linux') diff --git a/drivers/input/keyboard/pxa27x_keypad.c b/drivers/input/keyboard/pxa27x_keypad.c index e6f6adb56de4..4519eecb317b 100644 --- a/drivers/input/keyboard/pxa27x_keypad.c +++ b/drivers/input/keyboard/pxa27x_keypad.c @@ -21,13 +21,13 @@ #include #include #include +#include #include #include #include #include #include -#include /* * Keypad Controller registers */ @@ -100,54 +100,69 @@ #define keypad_readl(off) __raw_readl(keypad->mmio_base + (off)) #define keypad_writel(off, v) __raw_writel((v), keypad->mmio_base + (off)) +#define MAX_MATRIX_KEY_ROWS 8 +#define MAX_MATRIX_KEY_COLS 8 +#define MAX_DIRECT_KEY_NUM 8 +#define MAX_ROTARY_ENCODERS 2 + #define MAX_MATRIX_KEY_NUM (MAX_MATRIX_KEY_ROWS * MAX_MATRIX_KEY_COLS) #define MAX_KEYPAD_KEYS (MAX_MATRIX_KEY_NUM + MAX_DIRECT_KEY_NUM) -struct pxa27x_keypad { - const struct pxa27x_keypad_platform_data *pdata; +struct pxa27x_keypad_rotary { + unsigned short *key_codes; + int rel_code; + bool enabled; +}; +struct pxa27x_keypad { struct clk *clk; struct input_dev *input_dev; void __iomem *mmio_base; int irq; - unsigned short keycodes[MAX_KEYPAD_KEYS]; - int rotary_rel_code[2]; - + unsigned int matrix_key_rows; + unsigned int matrix_key_cols; unsigned int row_shift; + unsigned int direct_key_num; + unsigned int direct_key_mask; + bool direct_key_low_active; + + /* key debounce interval */ + unsigned int debounce_interval; + + unsigned short keycodes[MAX_KEYPAD_KEYS]; + /* state row bits of each column scan */ u32 matrix_key_state[MAX_MATRIX_KEY_COLS]; u32 direct_key_state; - unsigned int direct_key_mask; + struct pxa27x_keypad_rotary rotary[MAX_ROTARY_ENCODERS]; }; -#ifdef CONFIG_OF -static int pxa27x_keypad_matrix_key_parse_dt(struct pxa27x_keypad *keypad, - struct pxa27x_keypad_platform_data *pdata) +static int pxa27x_keypad_matrix_key_parse(struct pxa27x_keypad *keypad) { struct input_dev *input_dev = keypad->input_dev; struct device *dev = input_dev->dev.parent; - u32 rows, cols; int error; - error = matrix_keypad_parse_properties(dev, &rows, &cols); + error = matrix_keypad_parse_properties(dev, &keypad->matrix_key_rows, + &keypad->matrix_key_cols); if (error) return error; - if (rows > MAX_MATRIX_KEY_ROWS || cols > MAX_MATRIX_KEY_COLS) { + if (keypad->matrix_key_rows > MAX_MATRIX_KEY_ROWS || + keypad->matrix_key_cols > MAX_MATRIX_KEY_COLS) { dev_err(dev, "rows or cols exceeds maximum value\n"); return -EINVAL; } - pdata->matrix_key_rows = rows; - pdata->matrix_key_cols = cols; + keypad->row_shift = get_count_order(keypad->matrix_key_cols); error = matrix_keypad_build_keymap(NULL, NULL, - pdata->matrix_key_rows, - pdata->matrix_key_cols, + keypad->matrix_key_rows, + keypad->matrix_key_cols, keypad->keycodes, input_dev); if (error) return error; @@ -155,20 +170,17 @@ static int pxa27x_keypad_matrix_key_parse_dt(struct pxa27x_keypad *keypad, return 0; } -static int pxa27x_keypad_direct_key_parse_dt(struct pxa27x_keypad *keypad, - struct pxa27x_keypad_platform_data *pdata) +static int pxa27x_keypad_direct_key_parse(struct pxa27x_keypad *keypad) { struct input_dev *input_dev = keypad->input_dev; struct device *dev = input_dev->dev.parent; - struct device_node 
*np = dev->of_node; - const __be16 *prop; unsigned short code; - unsigned int proplen, size; + int count; int i; int error; - error = of_property_read_u32(np, "marvell,direct-key-count", - &pdata->direct_key_num); + error = device_property_read_u32(dev, "marvell,direct-key-count", + &keypad->direct_key_num); if (error) { /* * If do not have marvel,direct-key-count defined, @@ -177,151 +189,121 @@ static int pxa27x_keypad_direct_key_parse_dt(struct pxa27x_keypad *keypad, return error == -EINVAL ? 0 : error; } - error = of_property_read_u32(np, "marvell,direct-key-mask", - &pdata->direct_key_mask); + error = device_property_read_u32(dev, "marvell,direct-key-mask", + &keypad->direct_key_mask); if (error) { if (error != -EINVAL) return error; /* * If marvell,direct-key-mask is not defined, driver will use - * default value. Default value is set when configure the keypad. + * a default value based on number of direct keys set up. + * The default value is calculated in pxa27x_keypad_config(). */ - pdata->direct_key_mask = 0; + keypad->direct_key_mask = 0; } - pdata->direct_key_low_active = of_property_read_bool(np, - "marvell,direct-key-low-active"); - - prop = of_get_property(np, "marvell,direct-key-map", &proplen); - if (!prop) - return -EINVAL; + keypad->direct_key_low_active = + device_property_read_bool(dev, "marvell,direct-key-low-active"); - if (proplen % sizeof(u16)) + count = device_property_count_u16(dev, "marvell,direct-key-map"); + if (count <= 0 || count > MAX_DIRECT_KEY_NUM) return -EINVAL; - size = proplen / sizeof(u16); - - /* Only MAX_DIRECT_KEY_NUM is accepted.*/ - if (size > MAX_DIRECT_KEY_NUM) - return -EINVAL; + error = device_property_read_u16_array(dev, "marvell,direct-key-map", + &keypad->keycodes[MAX_MATRIX_KEY_NUM], + count); - for (i = 0; i < size; i++) { - code = be16_to_cpup(prop + i); - keypad->keycodes[MAX_MATRIX_KEY_NUM + i] = code; + for (i = 0; i < count; i++) { + code = keypad->keycodes[MAX_MATRIX_KEY_NUM + i]; __set_bit(code, input_dev->keybit); } return 0; } -static int pxa27x_keypad_rotary_parse_dt(struct pxa27x_keypad *keypad, - struct pxa27x_keypad_platform_data *pdata) +static int pxa27x_keypad_rotary_parse(struct pxa27x_keypad *keypad) { - const __be32 *prop; - int i, relkey_ret; - unsigned int code, proplen; - const char *rotaryname[2] = { - "marvell,rotary0", "marvell,rotary1"}; - const char relkeyname[] = {"marvell,rotary-rel-key"}; + static const char * const rotaryname[] = { "marvell,rotary0", "marvell,rotary1" }; struct input_dev *input_dev = keypad->input_dev; struct device *dev = input_dev->dev.parent; - struct device_node *np = dev->of_node; - - relkey_ret = of_property_read_u32(np, relkeyname, &code); - /* if can read correct rotary key-code, we do not need this. */ - if (relkey_ret == 0) { - unsigned short relcode; + struct pxa27x_keypad_rotary *encoder; + unsigned int code; + int i; + int error; - /* rotary0 taks lower half, rotary1 taks upper half. 
*/ - relcode = code & 0xffff; - pdata->rotary0_rel_code = (code & 0xffff); - __set_bit(relcode, input_dev->relbit); + error = device_property_read_u32(dev, "marvell,rotary-rel-key", &code); + if (!error) { + for (i = 0; i < MAX_ROTARY_ENCODERS; i++, code >>= 16) { + encoder = &keypad->rotary[i]; + encoder->enabled = true; + encoder->rel_code = code & 0xffff; + input_set_capability(input_dev, EV_REL, encoder->rel_code); + } - relcode = code >> 16; - pdata->rotary1_rel_code = relcode; - __set_bit(relcode, input_dev->relbit); + return 0; } - for (i = 0; i < 2; i++) { - prop = of_get_property(np, rotaryname[i], &proplen); + for (i = 0; i < MAX_ROTARY_ENCODERS; i++) { + encoder = &keypad->rotary[i]; + /* * If the prop is not set, it means keypad does not need * initialize the rotaryX. */ - if (!prop) + if (!device_property_present(dev, rotaryname[i])) continue; - code = be32_to_cpup(prop); + error = device_property_read_u32(dev, rotaryname[i], &code); + if (error) + return error; + /* * Not all up/down key code are valid. * Now we depends on direct-rel-code. */ - if ((!(code & 0xffff) || !(code >> 16)) && relkey_ret) { - return relkey_ret; - } else { - unsigned int n = MAX_MATRIX_KEY_NUM + (i << 1); - unsigned short keycode; - - keycode = code & 0xffff; - keypad->keycodes[n] = keycode; - __set_bit(keycode, input_dev->keybit); - - keycode = code >> 16; - keypad->keycodes[n + 1] = keycode; - __set_bit(keycode, input_dev->keybit); - - if (i == 0) - pdata->rotary0_rel_code = -1; - else - pdata->rotary1_rel_code = -1; - } - if (i == 0) - pdata->enable_rotary0 = 1; - else - pdata->enable_rotary1 = 1; - } + if (!(code & 0xffff) || !(code >> 16)) + return -EINVAL; + + encoder->enabled = true; + encoder->rel_code = -1; + encoder->key_codes = &keypad->keycodes[MAX_MATRIX_KEY_NUM + i * 2]; + encoder->key_codes[0] = code & 0xffff; + encoder->key_codes[1] = code >> 16; - keypad->rotary_rel_code[0] = pdata->rotary0_rel_code; - keypad->rotary_rel_code[1] = pdata->rotary1_rel_code; + input_set_capability(input_dev, EV_KEY, encoder->key_codes[0]); + input_set_capability(input_dev, EV_KEY, encoder->key_codes[1]); + } return 0; } -static int pxa27x_keypad_build_keycode_from_dt(struct pxa27x_keypad *keypad) +static int pxa27x_keypad_parse_properties(struct pxa27x_keypad *keypad) { struct input_dev *input_dev = keypad->input_dev; struct device *dev = input_dev->dev.parent; - struct device_node *np = dev->of_node; - struct pxa27x_keypad_platform_data *pdata; int error; - pdata = devm_kzalloc(dev, sizeof(*pdata), GFP_KERNEL); - if (!pdata) { - dev_err(dev, "failed to allocate memory for pdata\n"); - return -ENOMEM; - } - - error = pxa27x_keypad_matrix_key_parse_dt(keypad, pdata); + error = pxa27x_keypad_matrix_key_parse(keypad); if (error) { dev_err(dev, "failed to parse matrix key\n"); return error; } - error = pxa27x_keypad_direct_key_parse_dt(keypad, pdata); + error = pxa27x_keypad_direct_key_parse(keypad); if (error) { dev_err(dev, "failed to parse direct key\n"); return error; } - error = pxa27x_keypad_rotary_parse_dt(keypad, pdata); + error = pxa27x_keypad_rotary_parse(keypad); if (error) { dev_err(dev, "failed to parse rotary key\n"); return error; } - error = of_property_read_u32(np, "marvell,debounce-interval", - &pdata->debounce_interval); + error = device_property_read_u32(dev, "marvell,debounce-interval", + &keypad->debounce_interval); if (error) { dev_err(dev, "failed to parse debounce-interval\n"); return error; @@ -333,91 +315,11 @@ static int pxa27x_keypad_build_keycode_from_dt(struct pxa27x_keypad 
*keypad) */ input_dev->keycodemax = ARRAY_SIZE(keypad->keycodes); - keypad->pdata = pdata; - return 0; -} - -#else - -static int pxa27x_keypad_build_keycode_from_dt(struct pxa27x_keypad *keypad) -{ - dev_info(keypad->input_dev->dev.parent, "missing platform data\n"); - - return -EINVAL; -} - -#endif - -static int pxa27x_keypad_build_keycode(struct pxa27x_keypad *keypad) -{ - const struct pxa27x_keypad_platform_data *pdata = keypad->pdata; - struct input_dev *input_dev = keypad->input_dev; - unsigned short keycode; - int i; - int error; - - error = matrix_keypad_build_keymap(pdata->matrix_keymap_data, NULL, - pdata->matrix_key_rows, - pdata->matrix_key_cols, - keypad->keycodes, input_dev); - if (error) - return error; - - /* - * The keycodes may not only include matrix keys but also the direct - * or rotary keys. - */ - input_dev->keycodemax = ARRAY_SIZE(keypad->keycodes); - - /* For direct keys. */ - for (i = 0; i < pdata->direct_key_num; i++) { - keycode = pdata->direct_key_map[i]; - keypad->keycodes[MAX_MATRIX_KEY_NUM + i] = keycode; - __set_bit(keycode, input_dev->keybit); - } - - if (pdata->enable_rotary0) { - if (pdata->rotary0_up_key && pdata->rotary0_down_key) { - keycode = pdata->rotary0_up_key; - keypad->keycodes[MAX_MATRIX_KEY_NUM + 0] = keycode; - __set_bit(keycode, input_dev->keybit); - - keycode = pdata->rotary0_down_key; - keypad->keycodes[MAX_MATRIX_KEY_NUM + 1] = keycode; - __set_bit(keycode, input_dev->keybit); - - keypad->rotary_rel_code[0] = -1; - } else { - keypad->rotary_rel_code[0] = pdata->rotary0_rel_code; - __set_bit(pdata->rotary0_rel_code, input_dev->relbit); - } - } - - if (pdata->enable_rotary1) { - if (pdata->rotary1_up_key && pdata->rotary1_down_key) { - keycode = pdata->rotary1_up_key; - keypad->keycodes[MAX_MATRIX_KEY_NUM + 2] = keycode; - __set_bit(keycode, input_dev->keybit); - - keycode = pdata->rotary1_down_key; - keypad->keycodes[MAX_MATRIX_KEY_NUM + 3] = keycode; - __set_bit(keycode, input_dev->keybit); - - keypad->rotary_rel_code[1] = -1; - } else { - keypad->rotary_rel_code[1] = pdata->rotary1_rel_code; - __set_bit(pdata->rotary1_rel_code, input_dev->relbit); - } - } - - __clear_bit(KEY_RESERVED, input_dev->keybit); - return 0; } static void pxa27x_keypad_scan_matrix(struct pxa27x_keypad *keypad) { - const struct pxa27x_keypad_platform_data *pdata = keypad->pdata; struct input_dev *input_dev = keypad->input_dev; int row, col, num_keys_pressed = 0; u32 new_state[MAX_MATRIX_KEY_COLS]; @@ -435,8 +337,8 @@ static void pxa27x_keypad_scan_matrix(struct pxa27x_keypad *keypad) row = KPAS_RP(kpas); /* if invalid row/col, treat as no key pressed */ - if (col >= pdata->matrix_key_cols || - row >= pdata->matrix_key_rows) + if (col >= keypad->matrix_key_cols || + row >= keypad->matrix_key_rows) goto scan; new_state[col] = BIT(row); @@ -459,7 +361,7 @@ static void pxa27x_keypad_scan_matrix(struct pxa27x_keypad *keypad) new_state[7] = (kpasmkp3 >> 16) & KPASMKP_MKC_MASK; } scan: - for (col = 0; col < pdata->matrix_key_cols; col++) { + for (col = 0; col < keypad->matrix_key_cols; col++) { u32 bits_changed; int code; @@ -467,7 +369,7 @@ scan: if (bits_changed == 0) continue; - for (row = 0; row < pdata->matrix_key_rows; row++) { + for (row = 0; row < keypad->matrix_key_rows; row++) { if ((bits_changed & BIT(row)) == 0) continue; @@ -496,14 +398,16 @@ static inline int rotary_delta(u32 kprec) static void report_rotary_event(struct pxa27x_keypad *keypad, int r, int delta) { + struct pxa27x_keypad_rotary *encoder = &keypad->rotary[r]; struct input_dev *dev = 
keypad->input_dev; - if (delta == 0) + if (!encoder->enabled || delta == 0) return; - if (keypad->rotary_rel_code[r] == -1) { - int code = MAX_MATRIX_KEY_NUM + 2 * r + (delta > 0 ? 0 : 1); - unsigned char keycode = keypad->keycodes[code]; + if (encoder->rel_code == -1) { + int idx = delta > 0 ? 0 : 1; + int code = MAX_MATRIX_KEY_NUM + 2 * r + idx; + unsigned char keycode = encoder->key_codes[idx]; /* simulate a press-n-release */ input_event(dev, EV_MSC, MSC_SCAN, code); @@ -513,30 +417,28 @@ static void report_rotary_event(struct pxa27x_keypad *keypad, int r, int delta) input_report_key(dev, keycode, 0); input_sync(dev); } else { - input_report_rel(dev, keypad->rotary_rel_code[r], delta); + input_report_rel(dev, encoder->rel_code, delta); input_sync(dev); } } static void pxa27x_keypad_scan_rotary(struct pxa27x_keypad *keypad) { - const struct pxa27x_keypad_platform_data *pdata = keypad->pdata; u32 kprec; + int i; /* read and reset to default count value */ kprec = keypad_readl(KPREC); keypad_writel(KPREC, DEFAULT_KPREC); - if (pdata->enable_rotary0) + for (i = 0; i < MAX_ROTARY_ENCODERS; i++) { report_rotary_event(keypad, 0, rotary_delta(kprec)); - - if (pdata->enable_rotary1) - report_rotary_event(keypad, 1, rotary_delta(kprec >> 16)); + kprec >>= 16; + } } static void pxa27x_keypad_scan_direct(struct pxa27x_keypad *keypad) { - const struct pxa27x_keypad_platform_data *pdata = keypad->pdata; struct input_dev *input_dev = keypad->input_dev; unsigned int new_state; u32 kpdk, bits_changed; @@ -544,14 +446,14 @@ static void pxa27x_keypad_scan_direct(struct pxa27x_keypad *keypad) kpdk = keypad_readl(KPDK); - if (pdata->enable_rotary0 || pdata->enable_rotary1) + if (keypad->rotary[0].enabled || keypad->rotary[1].enabled) pxa27x_keypad_scan_rotary(keypad); /* * The KPDR_DK only output the key pin level, so it relates to board, * and low level may be active. 
*/ - if (pdata->direct_key_low_active) + if (keypad->direct_key_low_active) new_state = ~KPDK_DK(kpdk) & keypad->direct_key_mask; else new_state = KPDK_DK(kpdk) & keypad->direct_key_mask; @@ -561,7 +463,7 @@ static void pxa27x_keypad_scan_direct(struct pxa27x_keypad *keypad) if (bits_changed == 0) return; - for (i = 0; i < pdata->direct_key_num; i++) { + for (i = 0; i < keypad->direct_key_num; i++) { if (bits_changed & BIT(i)) { int code = MAX_MATRIX_KEY_NUM + i; @@ -574,21 +476,11 @@ static void pxa27x_keypad_scan_direct(struct pxa27x_keypad *keypad) keypad->direct_key_state = new_state; } -static void clear_wakeup_event(struct pxa27x_keypad *keypad) -{ - const struct pxa27x_keypad_platform_data *pdata = keypad->pdata; - - if (pdata->clear_wakeup_event) - (pdata->clear_wakeup_event)(); -} - static irqreturn_t pxa27x_keypad_irq_handler(int irq, void *dev_id) { struct pxa27x_keypad *keypad = dev_id; unsigned long kpc = keypad_readl(KPC); - clear_wakeup_event(keypad); - if (kpc & KPC_DI) pxa27x_keypad_scan_direct(keypad); @@ -600,7 +492,6 @@ static irqreturn_t pxa27x_keypad_irq_handler(int irq, void *dev_id) static void pxa27x_keypad_config(struct pxa27x_keypad *keypad) { - const struct pxa27x_keypad_platform_data *pdata = keypad->pdata; unsigned int mask = 0, direct_key_num = 0; unsigned long kpc = 0; @@ -608,35 +499,33 @@ static void pxa27x_keypad_config(struct pxa27x_keypad *keypad) keypad_readl(KPC); /* enable matrix keys with automatic scan */ - if (pdata->matrix_key_rows && pdata->matrix_key_cols) { + if (keypad->matrix_key_rows && keypad->matrix_key_cols) { kpc |= KPC_ASACT | KPC_MIE | KPC_ME | KPC_MS_ALL; - kpc |= KPC_MKRN(pdata->matrix_key_rows) | - KPC_MKCN(pdata->matrix_key_cols); + kpc |= KPC_MKRN(keypad->matrix_key_rows) | + KPC_MKCN(keypad->matrix_key_cols); } /* enable rotary key, debounce interval same as direct keys */ - if (pdata->enable_rotary0) { + if (keypad->rotary[0].enabled) { mask |= 0x03; direct_key_num = 2; kpc |= KPC_REE0; } - if (pdata->enable_rotary1) { + if (keypad->rotary[1].enabled) { mask |= 0x0c; direct_key_num = 4; kpc |= KPC_REE1; } - if (pdata->direct_key_num > direct_key_num) - direct_key_num = pdata->direct_key_num; + if (keypad->direct_key_num > direct_key_num) + direct_key_num = keypad->direct_key_num; /* * Direct keys usage may not start from KP_DKIN0, check the platfrom * mask data to config the specific. 
*/ - if (pdata->direct_key_mask) - keypad->direct_key_mask = pdata->direct_key_mask; - else + if (!keypad->direct_key_mask) keypad->direct_key_mask = GENMASK(direct_key_num - 1, 0) & ~mask; /* enable direct key */ @@ -645,7 +534,7 @@ static void pxa27x_keypad_config(struct pxa27x_keypad *keypad) keypad_writel(KPC, kpc | KPC_RE_ZERO_DEB); keypad_writel(KPREC, DEFAULT_KPREC); - keypad_writel(KPKDI, pdata->debounce_interval); + keypad_writel(KPKDI, keypad->debounce_interval); } static int pxa27x_keypad_open(struct input_dev *dev) @@ -719,19 +608,12 @@ static int pxa27x_keypad_resume(struct device *dev) static DEFINE_SIMPLE_DEV_PM_OPS(pxa27x_keypad_pm_ops, pxa27x_keypad_suspend, pxa27x_keypad_resume); - static int pxa27x_keypad_probe(struct platform_device *pdev) { - const struct pxa27x_keypad_platform_data *pdata = - dev_get_platdata(&pdev->dev); - struct device_node *np = pdev->dev.of_node; struct pxa27x_keypad *keypad; struct input_dev *input_dev; - int irq, error; - - /* Driver need build keycode from device tree or pdata */ - if (!np && !pdata) - return -EINVAL; + int irq; + int error; irq = platform_get_irq(pdev, 0); if (irq < 0) @@ -746,7 +628,6 @@ static int pxa27x_keypad_probe(struct platform_device *pdev) if (!input_dev) return -ENOMEM; - keypad->pdata = pdata; keypad->input_dev = input_dev; keypad->irq = irq; @@ -775,29 +656,12 @@ static int pxa27x_keypad_probe(struct platform_device *pdev) input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP); input_set_capability(input_dev, EV_MSC, MSC_SCAN); - if (pdata) { - error = pxa27x_keypad_build_keycode(keypad); - } else { - error = pxa27x_keypad_build_keycode_from_dt(keypad); - /* - * Data that we get from DT resides in dynamically - * allocated memory so we need to update our pdata - * pointer. - */ - pdata = keypad->pdata; - } + error = pxa27x_keypad_parse_properties(keypad); if (error) { - dev_err(&pdev->dev, "failed to build keycode\n"); + dev_err(&pdev->dev, "failed to parse keypad properties\n"); return error; } - keypad->row_shift = get_count_order(pdata->matrix_key_cols); - - if ((pdata->enable_rotary0 && keypad->rotary_rel_code[0] != -1) || - (pdata->enable_rotary1 && keypad->rotary_rel_code[1] != -1)) { - input_dev->evbit[0] |= BIT_MASK(EV_REL); - } - error = devm_request_irq(&pdev->dev, irq, pxa27x_keypad_irq_handler, 0, pdev->name, keypad); if (error) { diff --git a/include/linux/platform_data/keypad-pxa27x.h b/include/linux/platform_data/keypad-pxa27x.h deleted file mode 100644 index a376442b9935..000000000000 --- a/include/linux/platform_data/keypad-pxa27x.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_ARCH_PXA27x_KEYPAD_H -#define __ASM_ARCH_PXA27x_KEYPAD_H - -#include -#include - -#define MAX_MATRIX_KEY_ROWS (8) -#define MAX_MATRIX_KEY_COLS (8) -#define MATRIX_ROW_SHIFT (3) -#define MAX_DIRECT_KEY_NUM (8) - -/* pxa3xx keypad platform specific parameters - * - * NOTE: - * 1. direct_key_num indicates the number of keys in the direct keypad - * _plus_ the number of rotary-encoder sensor inputs, this can be - * left as 0 if only rotary encoders are enabled, the driver will - * automatically calculate this - * - * 2. direct_key_map is the key code map for the direct keys, if rotary - * encoder(s) are enabled, direct key 0/1(2/3) will be ignored - * - * 3. rotary can be either interpreted as a relative input event (e.g. - * REL_WHEEL/REL_HWHEEL) or specific keys (e.g. UP/DOWN/LEFT/RIGHT) - * - * 4. 
matrix key and direct key will use the same debounce_interval by - * default, which should be sufficient in most cases - * - * pxa168 keypad platform specific parameter - * - * NOTE: - * clear_wakeup_event callback is a workaround required to clear the - * keypad interrupt. The keypad wake must be cleared in addition to - * reading the MI/DI bits in the KPC register. - */ -struct pxa27x_keypad_platform_data { - - /* code map for the matrix keys */ - const struct matrix_keymap_data *matrix_keymap_data; - unsigned int matrix_key_rows; - unsigned int matrix_key_cols; - - /* direct keys */ - int direct_key_num; - unsigned int direct_key_map[MAX_DIRECT_KEY_NUM]; - /* the key output may be low active */ - int direct_key_low_active; - /* give board a chance to choose the start direct key */ - unsigned int direct_key_mask; - - /* rotary encoders 0 */ - int enable_rotary0; - int rotary0_rel_code; - int rotary0_up_key; - int rotary0_down_key; - - /* rotary encoders 1 */ - int enable_rotary1; - int rotary1_rel_code; - int rotary1_up_key; - int rotary1_down_key; - - /* key debounce interval */ - unsigned int debounce_interval; - - /* clear wakeup event requirement for pxa168 */ - void (*clear_wakeup_event)(void); -}; - -extern void pxa_set_keypad_info(struct pxa27x_keypad_platform_data *info); - -#endif /* __ASM_ARCH_PXA27x_KEYPAD_H */ -- cgit v1.2.3 From 890ba82a60cb7a6584968a4ac15546f434afa40b Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 17 Aug 2025 14:39:06 -0700 Subject: Input: spear-keyboard - drop support for platform data There are no users of spear kbd_platform_data in the kernel, and the driver supports configuration via device tree, so drop support for static platform data and move properties parsing from OF-specific methods to generic ones.
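The probe rewrite below follows the usual required-versus-optional property pattern; as a short fragment mirroring the patch (kbd and pdev come from the probe context), "st,mode" must be present, while a missing "suspended_rate" simply keeps the zero-initialized default:

    /* required: fail the probe if absent or malformed */
    error = device_property_read_u32(&pdev->dev, "st,mode", &kbd->mode);
    if (error) {
            dev_err(&pdev->dev, "Invalid or missing mode\n");
            return error;
    }

    /* optional: the return value is deliberately ignored */
    device_property_read_u32(&pdev->dev, "suspended_rate",
                             &kbd->suspended_rate);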
Link: https://lore.kernel.org/r/vppjxui76im26uamznx7evm5lmbe3d6v3oxsa7mqyytykh4zm6@nhlf33v3hp6g Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/spear-keyboard.c | 71 +++--------- include/linux/platform_data/keyboard-spear.h | 164 --------------------------- 2 files changed, 15 insertions(+), 220 deletions(-) delete mode 100644 include/linux/platform_data/keyboard-spear.h (limited to 'include/linux') diff --git a/drivers/input/keyboard/spear-keyboard.c b/drivers/input/keyboard/spear-keyboard.c index 2fae337562a2..53f3ac64c980 100644 --- a/drivers/input/keyboard/spear-keyboard.c +++ b/drivers/input/keyboard/spear-keyboard.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -22,7 +23,6 @@ #include #include #include -#include /* Keyboard Registers */ #define MODE_CTL_REG 0x00 @@ -56,13 +56,12 @@ struct spear_kbd { void __iomem *io_base; struct clk *clk; unsigned int irq; - unsigned int mode; - unsigned int suspended_rate; + u32 mode; + u32 suspended_rate; + u32 mode_ctl_reg; unsigned short last_key; unsigned short keycodes[NUM_ROWS * NUM_COLS]; - bool rep; bool irq_wake_enabled; - u32 mode_ctl_reg; }; static irqreturn_t spear_kbd_interrupt(int irq, void *dev_id) @@ -143,46 +142,8 @@ static void spear_kbd_close(struct input_dev *dev) kbd->last_key = KEY_RESERVED; } -#ifdef CONFIG_OF -static int spear_kbd_parse_dt(struct platform_device *pdev, - struct spear_kbd *kbd) -{ - struct device_node *np = pdev->dev.of_node; - int error; - u32 val, suspended_rate; - - if (!np) { - dev_err(&pdev->dev, "Missing DT data\n"); - return -EINVAL; - } - - if (of_property_read_bool(np, "autorepeat")) - kbd->rep = true; - - if (of_property_read_u32(np, "suspended_rate", &suspended_rate)) - kbd->suspended_rate = suspended_rate; - - error = of_property_read_u32(np, "st,mode", &val); - if (error) { - dev_err(&pdev->dev, "DT: Invalid or missing mode\n"); - return error; - } - - kbd->mode = val; - return 0; -} -#else -static inline int spear_kbd_parse_dt(struct platform_device *pdev, - struct spear_kbd *kbd) -{ - return -ENOSYS; -} -#endif - static int spear_kbd_probe(struct platform_device *pdev) { - struct kbd_platform_data *pdata = dev_get_platdata(&pdev->dev); - const struct matrix_keymap_data *keymap = pdata ? 
pdata->keymap : NULL; struct spear_kbd *kbd; struct input_dev *input_dev; int irq; @@ -198,6 +159,14 @@ static int spear_kbd_probe(struct platform_device *pdev) return -ENOMEM; } + error = device_property_read_u32(&pdev->dev, "st,mode", &kbd->mode); + if (error) { + dev_err(&pdev->dev, "Invalid or missing mode\n"); + return error; + } + + device_property_read_u32(&pdev->dev, "suspended_rate", &kbd->suspended_rate); + input_dev = devm_input_allocate_device(&pdev->dev); if (!input_dev) { dev_err(&pdev->dev, "unable to allocate input device\n"); @@ -207,16 +176,6 @@ static int spear_kbd_probe(struct platform_device *pdev) kbd->input = input_dev; kbd->irq = irq; - if (!pdata) { - error = spear_kbd_parse_dt(pdev, kbd); - if (error) - return error; - } else { - kbd->mode = pdata->mode; - kbd->rep = pdata->rep; - kbd->suspended_rate = pdata->suspended_rate; - } - kbd->io_base = devm_platform_get_and_ioremap_resource(pdev, 0, NULL); if (IS_ERR(kbd->io_base)) return PTR_ERR(kbd->io_base); @@ -234,21 +193,21 @@ static int spear_kbd_probe(struct platform_device *pdev) input_dev->open = spear_kbd_open; input_dev->close = spear_kbd_close; - error = matrix_keypad_build_keymap(keymap, NULL, NUM_ROWS, NUM_COLS, + error = matrix_keypad_build_keymap(NULL, NULL, NUM_ROWS, NUM_COLS, kbd->keycodes, input_dev); if (error) { dev_err(&pdev->dev, "Failed to build keymap\n"); return error; } - if (kbd->rep) + if (device_property_read_bool(&pdev->dev, "autorepeat")) __set_bit(EV_REP, input_dev->evbit); input_set_capability(input_dev, EV_MSC, MSC_SCAN); input_set_drvdata(input_dev, kbd); error = devm_request_irq(&pdev->dev, irq, spear_kbd_interrupt, 0, - "keyboard", kbd); + "keyboard", kbd); if (error) { dev_err(&pdev->dev, "request_irq failed\n"); return error; diff --git a/include/linux/platform_data/keyboard-spear.h b/include/linux/platform_data/keyboard-spear.h deleted file mode 100644 index 5e3ff653900c..000000000000 --- a/include/linux/platform_data/keyboard-spear.h +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright (C) 2010 ST Microelectronics - * Rajeev Kumar - * - * This file is licensed under the terms of the GNU General Public - * License version 2. This program is licensed "as is" without any - * warranty of any kind, whether express or implied. 
- */ - -#ifndef __PLAT_KEYBOARD_H -#define __PLAT_KEYBOARD_H - -#include -#include -#include -#include - -#define DECLARE_9x9_KEYMAP(_name) \ -int _name[] = { \ - KEY(0, 0, KEY_ESC), \ - KEY(0, 1, KEY_1), \ - KEY(0, 2, KEY_2), \ - KEY(0, 3, KEY_3), \ - KEY(0, 4, KEY_4), \ - KEY(0, 5, KEY_5), \ - KEY(0, 6, KEY_6), \ - KEY(0, 7, KEY_7), \ - KEY(0, 8, KEY_8), \ - KEY(1, 0, KEY_9), \ - KEY(1, 1, KEY_MINUS), \ - KEY(1, 2, KEY_EQUAL), \ - KEY(1, 3, KEY_BACKSPACE), \ - KEY(1, 4, KEY_TAB), \ - KEY(1, 5, KEY_Q), \ - KEY(1, 6, KEY_W), \ - KEY(1, 7, KEY_E), \ - KEY(1, 8, KEY_R), \ - KEY(2, 0, KEY_T), \ - KEY(2, 1, KEY_Y), \ - KEY(2, 2, KEY_U), \ - KEY(2, 3, KEY_I), \ - KEY(2, 4, KEY_O), \ - KEY(2, 5, KEY_P), \ - KEY(2, 6, KEY_LEFTBRACE), \ - KEY(2, 7, KEY_RIGHTBRACE), \ - KEY(2, 8, KEY_ENTER), \ - KEY(3, 0, KEY_LEFTCTRL), \ - KEY(3, 1, KEY_A), \ - KEY(3, 2, KEY_S), \ - KEY(3, 3, KEY_D), \ - KEY(3, 4, KEY_F), \ - KEY(3, 5, KEY_G), \ - KEY(3, 6, KEY_H), \ - KEY(3, 7, KEY_J), \ - KEY(3, 8, KEY_K), \ - KEY(4, 0, KEY_L), \ - KEY(4, 1, KEY_SEMICOLON), \ - KEY(4, 2, KEY_APOSTROPHE), \ - KEY(4, 3, KEY_GRAVE), \ - KEY(4, 4, KEY_LEFTSHIFT), \ - KEY(4, 5, KEY_BACKSLASH), \ - KEY(4, 6, KEY_Z), \ - KEY(4, 7, KEY_X), \ - KEY(4, 8, KEY_C), \ - KEY(5, 0, KEY_V), \ - KEY(5, 1, KEY_B), \ - KEY(5, 2, KEY_N), \ - KEY(5, 3, KEY_M), \ - KEY(5, 4, KEY_COMMA), \ - KEY(5, 5, KEY_DOT), \ - KEY(5, 6, KEY_SLASH), \ - KEY(5, 7, KEY_RIGHTSHIFT), \ - KEY(5, 8, KEY_KPASTERISK), \ - KEY(6, 0, KEY_LEFTALT), \ - KEY(6, 1, KEY_SPACE), \ - KEY(6, 2, KEY_CAPSLOCK), \ - KEY(6, 3, KEY_F1), \ - KEY(6, 4, KEY_F2), \ - KEY(6, 5, KEY_F3), \ - KEY(6, 6, KEY_F4), \ - KEY(6, 7, KEY_F5), \ - KEY(6, 8, KEY_F6), \ - KEY(7, 0, KEY_F7), \ - KEY(7, 1, KEY_F8), \ - KEY(7, 2, KEY_F9), \ - KEY(7, 3, KEY_F10), \ - KEY(7, 4, KEY_NUMLOCK), \ - KEY(7, 5, KEY_SCROLLLOCK), \ - KEY(7, 6, KEY_KP7), \ - KEY(7, 7, KEY_KP8), \ - KEY(7, 8, KEY_KP9), \ - KEY(8, 0, KEY_KPMINUS), \ - KEY(8, 1, KEY_KP4), \ - KEY(8, 2, KEY_KP5), \ - KEY(8, 3, KEY_KP6), \ - KEY(8, 4, KEY_KPPLUS), \ - KEY(8, 5, KEY_KP1), \ - KEY(8, 6, KEY_KP2), \ - KEY(8, 7, KEY_KP3), \ - KEY(8, 8, KEY_KP0), \ -} - -#define DECLARE_6x6_KEYMAP(_name) \ -int _name[] = { \ - KEY(0, 0, KEY_RESERVED), \ - KEY(0, 1, KEY_1), \ - KEY(0, 2, KEY_2), \ - KEY(0, 3, KEY_3), \ - KEY(0, 4, KEY_4), \ - KEY(0, 5, KEY_5), \ - KEY(1, 0, KEY_Q), \ - KEY(1, 1, KEY_W), \ - KEY(1, 2, KEY_E), \ - KEY(1, 3, KEY_R), \ - KEY(1, 4, KEY_T), \ - KEY(1, 5, KEY_Y), \ - KEY(2, 0, KEY_D), \ - KEY(2, 1, KEY_F), \ - KEY(2, 2, KEY_G), \ - KEY(2, 3, KEY_H), \ - KEY(2, 4, KEY_J), \ - KEY(2, 5, KEY_K), \ - KEY(3, 0, KEY_B), \ - KEY(3, 1, KEY_N), \ - KEY(3, 2, KEY_M), \ - KEY(3, 3, KEY_COMMA), \ - KEY(3, 4, KEY_DOT), \ - KEY(3, 5, KEY_SLASH), \ - KEY(4, 0, KEY_F6), \ - KEY(4, 1, KEY_F7), \ - KEY(4, 2, KEY_F8), \ - KEY(4, 3, KEY_F9), \ - KEY(4, 4, KEY_F10), \ - KEY(4, 5, KEY_NUMLOCK), \ - KEY(5, 0, KEY_KP2), \ - KEY(5, 1, KEY_KP3), \ - KEY(5, 2, KEY_KP0), \ - KEY(5, 3, KEY_KPDOT), \ - KEY(5, 4, KEY_RO), \ - KEY(5, 5, KEY_ZENKAKUHANKAKU), \ -} - -#define KEYPAD_9x9 0 -#define KEYPAD_6x6 1 -#define KEYPAD_2x2 2 - -/** - * struct kbd_platform_data - spear keyboard platform data - * keymap: pointer to keymap data (table and size) - * rep: enables key autorepeat - * mode: choose keyboard support(9x9, 6x6, 2x2) - * suspended_rate: rate at which keyboard would operate in suspended mode - * - * This structure is supposed to be used by platform code to supply - * keymaps to drivers that implement keyboards. 
- */ -struct kbd_platform_data { - const struct matrix_keymap_data *keymap; - bool rep; - unsigned int mode; - unsigned int suspended_rate; -}; - -#endif /* __PLAT_KEYBOARD_H */ -- cgit v1.2.3 From ad0d05dbddc1bf86e92220fea873176de6b12f78 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 30 Aug 2025 10:18:21 +0800 Subject: blk-mq: Defer freeing of tags page_list to SRCU callback Tag iterators can race with the freeing of the request pages (tags->page_list), potentially leading to use-after-free issues. Defer the freeing of the page list and the tags structure itself until after an SRCU grace period has passed. This ensures that any concurrent tag iterators have completed before the memory is released. This way, we can replace the big tags->lock in the tag iterator code path with SRCU to solve the issue. This is achieved by: - Adding a new `srcu_struct tags_srcu` to `blk_mq_tag_set` to protect tag map iteration. - Adding an `rcu_head` to `struct blk_mq_tags` to be used with `call_srcu`. - Moving the page list freeing logic and the `kfree(tags)` call into a new callback function, `blk_mq_free_tags_callback`. - In `blk_mq_free_tags`, invoking `call_srcu` to schedule the new callback for deferred execution. The read-side protection for the tag iterators will be added in a subsequent patch. Reviewed-by: Hannes Reinecke Reviewed-by: Yu Kuai Signed-off-by: Ming Lei Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 24 +++++++++++++++++++++++- block/blk-mq.c | 26 +++++++++++++------------- include/linux/blk-mq.h | 2 ++ 3 files changed, 38 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index f09a4cbe486f..3c2ec6e86d54 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -8,6 +8,9 @@ */ #include #include +#include +#include +#include #include #include "blk.h" @@ -576,11 +579,30 @@ out_free_tags: return NULL; } +static void blk_mq_free_tags_callback(struct rcu_head *head) +{ + struct blk_mq_tags *tags = container_of(head, struct blk_mq_tags, + rcu_head); + struct page *page; + + while (!list_empty(&tags->page_list)) { + page = list_first_entry(&tags->page_list, struct page, lru); + list_del_init(&page->lru); + /* + * Remove kmemleak object previously allocated in + * blk_mq_alloc_rqs(). + */ + kmemleak_free(page_address(page)); + __free_pages(page, page->private); + } + kfree(tags); +} + void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) { sbitmap_queue_free(&tags->bitmap_tags); sbitmap_queue_free(&tags->breserved_tags); - kfree(tags); + call_srcu(&set->tags_srcu, &tags->rcu_head, blk_mq_free_tags_callback); } int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, diff --git a/block/blk-mq.c b/block/blk-mq.c index 5efa0712aac7..e1b44173029c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3454,7 +3454,6 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { struct blk_mq_tags *drv_tags; - struct page *page; if (list_empty(&tags->page_list)) return; @@ -3478,17 +3477,10 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } blk_mq_clear_rq_mapping(drv_tags, tags); - - while (!list_empty(&tags->page_list)) { - page = list_first_entry(&tags->page_list, struct page, lru); - list_del_init(&page->lru); - /* - * Remove kmemleak object previously allocated in - * blk_mq_alloc_rqs().
- */ - kmemleak_free(page_address(page)); - __free_pages(page, page->private); - } + /* + * Free request pages in SRCU callback, which is called from + * blk_mq_free_tags(). + */ } void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) @@ -4834,6 +4826,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (ret) goto out_free_srcu; } + ret = init_srcu_struct(&set->tags_srcu); + if (ret) + goto out_cleanup_srcu; init_rwsem(&set->update_nr_hwq_lock); @@ -4842,7 +4837,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) - goto out_cleanup_srcu; + goto out_cleanup_tags_srcu; for (i = 0; i < set->nr_maps; i++) { set->map[i].mq_map = kcalloc_node(nr_cpu_ids, @@ -4871,6 +4866,8 @@ out_free_mq_map: } kfree(set->tags); set->tags = NULL; +out_cleanup_tags_srcu: + cleanup_srcu_struct(&set->tags_srcu); out_cleanup_srcu: if (set->flags & BLK_MQ_F_BLOCKING) cleanup_srcu_struct(set->srcu); @@ -4916,6 +4913,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) kfree(set->tags); set->tags = NULL; + + srcu_barrier(&set->tags_srcu); + cleanup_srcu_struct(&set->tags_srcu); if (set->flags & BLK_MQ_F_BLOCKING) { cleanup_srcu_struct(set->srcu); kfree(set->srcu); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2a5a828f19a0..1325ceeb743a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -531,6 +531,7 @@ struct blk_mq_tag_set { struct mutex tag_list_lock; struct list_head tag_list; struct srcu_struct *srcu; + struct srcu_struct tags_srcu; struct rw_semaphore update_nr_hwq_lock; }; @@ -767,6 +768,7 @@ struct blk_mq_tags { * request pool */ spinlock_t lock; + struct rcu_head rcu_head; }; static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, -- cgit v1.2.3 From 473efbc3ca29f725abfb7b323798df596ef41f3e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 Sep 2025 08:18:15 -0600 Subject: io_uring/uring_cmd: fix __io_uring_cmd_do_in_task !CONFIG_IO_URING typo A manual application of this patch resulted in a typo for the stub function __io_uring_cmd_do_in_task(), for the case where CONFIG_IO_URING isn't true. Fix that up. Reported-by: Klara Modin Fixes: df3a7762ee24 ("io_uring/uring_cmd: add io_uring_cmd_tw_t type alias") Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 50dd6a53cb5e..1350af846ddd 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -109,8 +109,7 @@ static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, { } static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - io_uring_tw_t task_work_cb, - unsigned flags) + io_uring_cmd_tw_t task_work_cb, unsigned flags) { } static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, -- cgit v1.2.3 From 9cf93a8fa9513c6d3cc65bdd50e05c1355cef322 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 20 Aug 2025 14:04:24 -0500 Subject: ipmi: Allow an SMI sender to return an error Getting ready for handling when a BMC is non-responsive or broken, allow the sender operation to fail in an SMI. If it was a user-generated message it will return the error. The powernv code was already doing this internally, but the way it was written could result in deep stack descent if there were a lot of messages queued. Have its send return an error in this case. 
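The new contract, sketched for a hypothetical driver (the foo_* names and helpers are illustrative, not from this patch): sender() either queues the message and returns IPMI_CC_NO_ERROR, or returns an IPMI completion code, in which case the message handler delivers the error response for user-generated messages itself instead of the driver calling back into the handler:

    static int foo_sender(void *send_info, struct ipmi_smi_msg *msg)
    {
            struct foo_smi *smi = send_info;

            if (!foo_hw_ready(smi))         /* hypothetical readiness check */
                    return IPMI_ERR_UNSPECIFIED;

            foo_enqueue(smi, msg);          /* hypothetical queueing */
            return IPMI_CC_NO_ERROR;
    }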
Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_ipmb.c | 4 ++-- drivers/char/ipmi/ipmi_msghandler.c | 16 +++++++++++++--- drivers/char/ipmi/ipmi_powernv.c | 17 +++++++++-------- drivers/char/ipmi/ipmi_si_intf.c | 8 +++----- drivers/char/ipmi/ipmi_ssif.c | 4 ++-- include/linux/ipmi_smi.h | 10 ++++++---- 6 files changed, 35 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/ipmi/ipmi_ipmb.c b/drivers/char/ipmi/ipmi_ipmb.c index 6a4f279c7c1f..3a51e58b2487 100644 --- a/drivers/char/ipmi/ipmi_ipmb.c +++ b/drivers/char/ipmi/ipmi_ipmb.c @@ -404,8 +404,7 @@ static void ipmi_ipmb_shutdown(void *send_info) ipmi_ipmb_stop_thread(iidev); } -static void ipmi_ipmb_sender(void *send_info, - struct ipmi_smi_msg *msg) +static int ipmi_ipmb_sender(void *send_info, struct ipmi_smi_msg *msg) { struct ipmi_ipmb_dev *iidev = send_info; unsigned long flags; @@ -417,6 +416,7 @@ static void ipmi_ipmb_sender(void *send_info, spin_unlock_irqrestore(&iidev->lock, flags); up(&iidev->wake_thread); + return IPMI_CC_NO_ERROR; } static void ipmi_ipmb_request_events(void *send_info) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index c6a5673b6f01..a739c8822c91 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -4803,6 +4803,7 @@ static void smi_work(struct work_struct *t) int run_to_completion = READ_ONCE(intf->run_to_completion); struct ipmi_smi_msg *newmsg = NULL; struct ipmi_recv_msg *msg, *msg2; + int cc; /* * Start the next message if available. @@ -4811,7 +4812,7 @@ static void smi_work(struct work_struct *t) * because the lower layer is allowed to hold locks while calling * message delivery. */ - +restart: if (!run_to_completion) spin_lock_irqsave(&intf->xmit_msgs_lock, flags); if (intf->curr_msg == NULL && !intf->in_shutdown) { @@ -4832,8 +4833,17 @@ static void smi_work(struct work_struct *t) if (!run_to_completion) spin_unlock_irqrestore(&intf->xmit_msgs_lock, flags); - if (newmsg) - intf->handlers->sender(intf->send_info, newmsg); + if (newmsg) { + cc = intf->handlers->sender(intf->send_info, newmsg); + if (cc) { + if (newmsg->user_data) + deliver_err_response(intf, + newmsg->user_data, cc); + else + ipmi_free_smi_msg(newmsg); + goto restart; + } + } handle_new_recv_msgs(intf); diff --git a/drivers/char/ipmi/ipmi_powernv.c b/drivers/char/ipmi/ipmi_powernv.c index 4a2efafcd1f8..52a1130defe5 100644 --- a/drivers/char/ipmi/ipmi_powernv.c +++ b/drivers/char/ipmi/ipmi_powernv.c @@ -51,7 +51,7 @@ static void send_error_reply(struct ipmi_smi_powernv *smi, ipmi_smi_msg_received(smi->intf, msg); } -static void ipmi_powernv_send(void *send_info, struct ipmi_smi_msg *msg) +static int ipmi_powernv_send(void *send_info, struct ipmi_smi_msg *msg) { struct ipmi_smi_powernv *smi = send_info; struct opal_ipmi_msg *opal_msg; @@ -93,18 +93,19 @@ static void ipmi_powernv_send(void *send_info, struct ipmi_smi_msg *msg) smi->interface_id, opal_msg, size); rc = opal_ipmi_send(smi->interface_id, opal_msg, size); pr_devel("%s: -> %d\n", __func__, rc); - - if (!rc) { - smi->cur_msg = msg; - spin_unlock_irqrestore(&smi->msg_lock, flags); - return; + if (rc) { + comp = IPMI_ERR_UNSPECIFIED; + goto err_unlock; } - comp = IPMI_ERR_UNSPECIFIED; + smi->cur_msg = msg; + spin_unlock_irqrestore(&smi->msg_lock, flags); + return IPMI_CC_NO_ERROR; + err_unlock: spin_unlock_irqrestore(&smi->msg_lock, flags); err: - send_error_reply(smi, msg, comp); + return comp; } static int ipmi_powernv_recv(struct ipmi_smi_powernv *smi) 
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 7b2ba318c547..88d62ef71b1b 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -809,8 +809,6 @@ restart: * this if there is not yet an upper layer to handle anything. */ if (si_sm_result == SI_SM_ATTN || smi_info->got_attn) { - unsigned char msg[2]; - if (smi_info->si_state != SI_NORMAL) { /* * We got an ATTN, but we are doing something else. @@ -907,8 +905,7 @@ static void flush_messages(void *send_info) } } -static void sender(void *send_info, - struct ipmi_smi_msg *msg) +static int sender(void *send_info, struct ipmi_smi_msg *msg) { struct smi_info *smi_info = send_info; unsigned long flags; @@ -921,7 +918,7 @@ static void sender(void *send_info, * layer will call flush_messages to clear it out. */ smi_info->waiting_msg = msg; - return; + return IPMI_CC_NO_ERROR; } spin_lock_irqsave(&smi_info->si_lock, flags); @@ -936,6 +933,7 @@ static void sender(void *send_info, smi_info->waiting_msg = msg; check_start_timer_thread(smi_info); spin_unlock_irqrestore(&smi_info->si_lock, flags); + return IPMI_CC_NO_ERROR; } static void set_run_to_completion(void *send_info, bool i_run_to_completion) diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c index 1bc42830444d..1b63f7d2fcda 100644 --- a/drivers/char/ipmi/ipmi_ssif.c +++ b/drivers/char/ipmi/ipmi_ssif.c @@ -1068,8 +1068,7 @@ static void start_next_msg(struct ssif_info *ssif_info, unsigned long *flags) } } -static void sender(void *send_info, - struct ipmi_smi_msg *msg) +static int sender(void *send_info, struct ipmi_smi_msg *msg) { struct ssif_info *ssif_info = send_info; unsigned long oflags, *flags; @@ -1089,6 +1088,7 @@ static void sender(void *send_info, msg->data[0], msg->data[1], (long long)t.tv_sec, (long)t.tv_nsec / NSEC_PER_USEC); } + return IPMI_CC_NO_ERROR; } static int get_smi_info(void *send_info, struct ipmi_smi_info *data) diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index 5d69820d8b02..c2d975bbff60 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -109,8 +109,8 @@ struct ipmi_smi_msg { enum ipmi_smi_msg_type type; - long msgid; - void *user_data; + long msgid; + void *user_data; int data_size; unsigned char data[IPMI_MAX_MSG_LENGTH]; @@ -168,9 +168,11 @@ struct ipmi_smi_handlers { * are held when this is run. Message are delivered one at * a time by the message handler, a new message will not be * delivered until the previous message is returned. + * + * This can return an error if the SMI is not in a state where it + * can send a message. */ - void (*sender)(void *send_info, - struct ipmi_smi_msg *msg); + int (*sender)(void *send_info, struct ipmi_smi_msg *msg); /* * Called by the upper layer to request that we try to get -- cgit v1.2.3 From 3bc54ab3b9790ca92f197e9822e486665daa321c Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Wed, 20 Aug 2025 14:09:11 -0500 Subject: ipmi: Rename "user_data" to "recv_msg" in an SMI message It's only used to hold the corresponding receive message, so fix the name to make that clear and the type so nothing else can be accidentally assigned to it. 
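A small illustration of what the stronger type buys (fragment for illustration only, not from this patch): an assignment that the old void *user_data accepted silently is now caught at compile time:

    struct ipmi_recv_msg recv;      /* placeholder object */
    struct ipmi_smi_msg *smi_msg = ipmi_alloc_smi_msg();

    smi_msg->recv_msg = &recv;      /* OK: pointer types match */
    smi_msg->recv_msg = smi_msg;    /* compiled silently with void *,
                                     * now an incompatible-pointer error */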
Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 26 +++++++++++++------------- include/linux/ipmi_smi.h | 3 ++- 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index a739c8822c91..a0b67a35a5f0 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -1959,7 +1959,7 @@ static int i_ipmi_req_sysintf(struct ipmi_smi *intf, smi_msg->data[0] = (msg->netfn << 2) | (smi_addr->lun & 0x3); smi_msg->data[1] = msg->cmd; smi_msg->msgid = msgid; - smi_msg->user_data = recv_msg; + smi_msg->recv_msg = recv_msg; if (msg->data_len > 0) memcpy(&smi_msg->data[2], msg->data, msg->data_len); smi_msg->data_size = msg->data_len + 2; @@ -2040,7 +2040,7 @@ static int i_ipmi_req_ipmb(struct ipmi_smi *intf, * Save the receive message so we can use it * to deliver the response. */ - smi_msg->user_data = recv_msg; + smi_msg->recv_msg = recv_msg; } else { mutex_lock(&intf->seq_lock); @@ -2153,7 +2153,7 @@ static int i_ipmi_req_ipmb_direct(struct ipmi_smi *intf, memcpy(smi_msg->data + 4, msg->data, msg->data_len); smi_msg->data_size = msg->data_len + 4; - smi_msg->user_data = recv_msg; + smi_msg->recv_msg = recv_msg; return 0; } @@ -2216,7 +2216,7 @@ static int i_ipmi_req_lan(struct ipmi_smi *intf, * Save the receive message so we can use it * to deliver the response. */ - smi_msg->user_data = recv_msg; + smi_msg->recv_msg = recv_msg; } else { mutex_lock(&intf->seq_lock); @@ -4066,7 +4066,7 @@ static int handle_ipmb_direct_rcv_rsp(struct ipmi_smi *intf, struct ipmi_recv_msg *recv_msg; struct ipmi_ipmb_direct_addr *daddr; - recv_msg = msg->user_data; + recv_msg = msg->recv_msg; if (recv_msg == NULL) { dev_warn(intf->si_dev, "IPMI direct message received with no owner. This could be because of a malformed message, or because of a hardware error. Contact your hardware vendor for assistance.\n"); @@ -4489,7 +4489,7 @@ static int handle_bmc_rsp(struct ipmi_smi *intf, struct ipmi_recv_msg *recv_msg; struct ipmi_system_interface_addr *smi_addr; - recv_msg = msg->user_data; + recv_msg = msg->recv_msg; if (recv_msg == NULL) { dev_warn(intf->si_dev, "IPMI SMI message received with no owner. This could be because of a malformed message, or because of a hardware error. Contact your hardware vendor for assistance.\n"); @@ -4563,14 +4563,14 @@ return_unspecified: } else if ((msg->data_size >= 2) && (msg->data[0] == (IPMI_NETFN_APP_REQUEST << 2)) && (msg->data[1] == IPMI_SEND_MSG_CMD) - && (msg->user_data == NULL)) { + && (msg->recv_msg == NULL)) { if (intf->in_shutdown || intf->run_to_completion) goto out; /* * This is the local response to a command send, start - * the timer for these. The user_data will not be + * the timer for these. The recv_msg will not be * NULL if this is a response send, and we will let * response sends just go through. */ @@ -4630,7 +4630,7 @@ return_unspecified: requeue = handle_ipmb_direct_rcv_rsp(intf, msg); } else if ((msg->rsp[0] == ((IPMI_NETFN_APP_REQUEST|1) << 2)) && (msg->rsp[1] == IPMI_SEND_MSG_CMD) - && (msg->user_data != NULL)) { + && (msg->recv_msg != NULL)) { /* * It's a response to a response we sent. For this we * deliver a send message response to the user. 
@@ -4647,7 +4647,7 @@ return_unspecified: cc = msg->rsp[2]; process_response_response: - recv_msg = msg->user_data; + recv_msg = msg->recv_msg; requeue = 0; if (!recv_msg) @@ -4836,9 +4836,9 @@ restart: if (newmsg) { cc = intf->handlers->sender(intf->send_info, newmsg); if (cc) { - if (newmsg->user_data) + if (newmsg->recv_msg) deliver_err_response(intf, - newmsg->user_data, cc); + newmsg->recv_msg, cc); else ipmi_free_smi_msg(newmsg); goto restart; } @@ -5185,7 +5185,7 @@ struct ipmi_smi_msg *ipmi_alloc_smi_msg(void) rv = kmalloc(sizeof(struct ipmi_smi_msg), GFP_ATOMIC); if (rv) { rv->done = free_smi_msg; - rv->user_data = NULL; + rv->recv_msg = NULL; rv->type = IPMI_SMI_MSG_TYPE_NORMAL; atomic_inc(&smi_msg_inuse_count); } diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h index c2d975bbff60..892e2d656e1e 100644 --- a/include/linux/ipmi_smi.h +++ b/include/linux/ipmi_smi.h @@ -110,7 +110,8 @@ struct ipmi_smi_msg { enum ipmi_smi_msg_type type; long msgid; - void *user_data; + /* Response to this message, will be NULL if not from a user request. */ + struct ipmi_recv_msg *recv_msg; int data_size; unsigned char data[IPMI_MAX_MSG_LENGTH]; -- cgit v1.2.3 From e465ad7ef57aa1ec4122fd5b34c182d59629cb91 Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Mon, 11 Aug 2025 08:48:08 -0400 Subject: clk: ti: dpll: convert from round_rate() to determine_rate() The round_rate() clk ops is deprecated, so migrate this driver from round_rate() to determine_rate(). Part of these changes was done using the Coccinelle semantic patch from the cover letter of this series, and the rest of the changes were done manually. omap4_dpll_regm4xen_round_rate() is now only called by omap4_dpll_regm4xen_determine_rate(), so let's merge that functionality into one function. This is needed for another cleanup to completely remove the round_rate() clk ops from the clk core. Tested-by: Andreas Kemnade # OMAP3 GTA04, OMAP4 Panda Reviewed-by: Kevin Hilman Tested-by: Kevin Hilman Signed-off-by: Brian Masney --- drivers/clk/ti/clkt_dpll.c | 28 +++++++-------- drivers/clk/ti/clock.h | 6 +--- drivers/clk/ti/dpll.c | 4 +-- drivers/clk/ti/dpll3xxx.c | 7 ++-- drivers/clk/ti/dpll44xx.c | 89 ++++++++++++++++------------------------------ include/linux/clk/ti.h | 8 ++--- 6 files changed, 57 insertions(+), 85 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/ti/clkt_dpll.c b/drivers/clk/ti/clkt_dpll.c index a8c0fc20f791..2ecd66968af4 100644 --- a/drivers/clk/ti/clkt_dpll.c +++ b/drivers/clk/ti/clkt_dpll.c @@ -268,10 +268,9 @@ unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk) /* DPLL rate rounding code */ /** - * omap2_dpll_round_rate - round a target rate for an OMAP DPLL + * omap2_dpll_determine_rate - round a target rate for an OMAP DPLL * @hw: struct clk_hw containing the struct clk * for a DPLL - * @target_rate: desired DPLL clock rate - * @parent_rate: parent's DPLL clock rate + * @req: rate request * * Given a DPLL and a desired target rate, round the target rate to a * possible, programmable rate for this DPLL. Attempts to select the @@ -280,8 +279,7 @@ unsigned long omap2_get_dpll_rate(struct clk_hw_omap *clk) * (expensive) function again. Returns -EINVAL if the target rate * cannot be rounded, or the rounded rate upon success.
*/ -long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate, - unsigned long *parent_rate) +int omap2_dpll_determine_rate(struct clk_hw *hw, struct clk_rate_request *req) { struct clk_hw_omap *clk = to_clk_hw_omap(hw); int m, n, r, scaled_max_m; @@ -299,15 +297,15 @@ long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate, dd = clk->dpll_data; - if (dd->max_rate && target_rate > dd->max_rate) - target_rate = dd->max_rate; + if (dd->max_rate && req->rate > dd->max_rate) + req->rate = dd->max_rate; ref_rate = clk_hw_get_rate(dd->clk_ref); clk_name = clk_hw_get_name(hw); pr_debug("clock: %s: starting DPLL round_rate, target rate %lu\n", - clk_name, target_rate); + clk_name, req->rate); - scaled_rt_rp = target_rate / (ref_rate / DPLL_SCALE_FACTOR); + scaled_rt_rp = req->rate / (ref_rate / DPLL_SCALE_FACTOR); scaled_max_m = dd->max_multiplier * DPLL_SCALE_FACTOR; dd->last_rounded_rate = 0; @@ -332,7 +330,7 @@ long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate, if (m > scaled_max_m) break; - r = _dpll_test_mult(&m, n, &new_rate, target_rate, + r = _dpll_test_mult(&m, n, &new_rate, req->rate, ref_rate); /* m can't be set low enough for this n - try with a larger n */ @@ -340,7 +338,7 @@ long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate, continue; /* skip rates above our target rate */ - delta = target_rate - new_rate; + delta = req->rate - new_rate; if (delta < 0) continue; @@ -359,13 +357,15 @@ long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate, if (prev_min_delta == LONG_MAX) { pr_debug("clock: %s: cannot round to rate %lu\n", - clk_name, target_rate); + clk_name, req->rate); return -EINVAL; } dd->last_rounded_m = min_delta_m; dd->last_rounded_n = min_delta_n; - dd->last_rounded_rate = target_rate - prev_min_delta; + dd->last_rounded_rate = req->rate - prev_min_delta; - return dd->last_rounded_rate; + req->rate = dd->last_rounded_rate; + + return 0; } diff --git a/drivers/clk/ti/clock.h b/drivers/clk/ti/clock.h index 2de7acea1ea0..d5e24fe4ae3a 100644 --- a/drivers/clk/ti/clock.h +++ b/drivers/clk/ti/clock.h @@ -273,8 +273,7 @@ int omap3_noncore_dpll_set_rate_and_parent(struct clk_hw *hw, u8 index); int omap3_noncore_dpll_determine_rate(struct clk_hw *hw, struct clk_rate_request *req); -long omap2_dpll_round_rate(struct clk_hw *hw, unsigned long target_rate, - unsigned long *parent_rate); +int omap2_dpll_determine_rate(struct clk_hw *hw, struct clk_rate_request *req); unsigned long omap3_clkoutx2_recalc(struct clk_hw *hw, unsigned long parent_rate); @@ -296,9 +295,6 @@ void omap3_clk_lock_dpll5(void); unsigned long omap4_dpll_regm4xen_recalc(struct clk_hw *hw, unsigned long parent_rate); -long omap4_dpll_regm4xen_round_rate(struct clk_hw *hw, - unsigned long target_rate, - unsigned long *parent_rate); int omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw, struct clk_rate_request *req); int omap2_clk_for_each(int (*fn)(struct clk_hw_omap *hw)); diff --git a/drivers/clk/ti/dpll.c b/drivers/clk/ti/dpll.c index 1f55554e0d73..971adafd9a8b 100644 --- a/drivers/clk/ti/dpll.c +++ b/drivers/clk/ti/dpll.c @@ -77,7 +77,7 @@ const struct clk_hw_omap_ops clkhwops_omap3_dpll = {}; static const struct clk_ops omap2_dpll_core_ck_ops = { .get_parent = &omap2_init_dpll_parent, .recalc_rate = &omap2_dpllcore_recalc, - .round_rate = &omap2_dpll_round_rate, + .determine_rate = &omap2_dpll_determine_rate, .set_rate = &omap2_reprogram_dpllcore, }; #else @@ -88,7 +88,7 @@ static const struct clk_ops omap2_dpll_core_ck_ops 
= {}; static const struct clk_ops omap3_dpll_core_ck_ops = { .get_parent = &omap2_init_dpll_parent, .recalc_rate = &omap3_dpll_recalc, - .round_rate = &omap2_dpll_round_rate, + .determine_rate = &omap2_dpll_determine_rate, }; static const struct clk_ops omap3_dpll_ck_ops = { diff --git a/drivers/clk/ti/dpll3xxx.c b/drivers/clk/ti/dpll3xxx.c index 00680486b1bd..8c51b988a04f 100644 --- a/drivers/clk/ti/dpll3xxx.c +++ b/drivers/clk/ti/dpll3xxx.c @@ -587,6 +587,7 @@ int omap3_noncore_dpll_determine_rate(struct clk_hw *hw, { struct clk_hw_omap *clk = to_clk_hw_omap(hw); struct dpll_data *dd; + int ret; if (!req->rate) return -EINVAL; @@ -599,8 +600,10 @@ int omap3_noncore_dpll_determine_rate(struct clk_hw *hw, (dd->modes & (1 << DPLL_LOW_POWER_BYPASS))) { req->best_parent_hw = dd->clk_bypass; } else { - req->rate = omap2_dpll_round_rate(hw, req->rate, - &req->best_parent_rate); + ret = omap2_dpll_determine_rate(hw, req); + if (ret != 0) + return ret; + req->best_parent_hw = dd->clk_ref; } diff --git a/drivers/clk/ti/dpll44xx.c b/drivers/clk/ti/dpll44xx.c index 3fc2cab69a3f..08ed57f181b4 100644 --- a/drivers/clk/ti/dpll44xx.c +++ b/drivers/clk/ti/dpll44xx.c @@ -133,61 +133,6 @@ unsigned long omap4_dpll_regm4xen_recalc(struct clk_hw *hw, return rate; } -/** - * omap4_dpll_regm4xen_round_rate - round DPLL rate, considering REGM4XEN bit - * @hw: struct hw_clk containing the struct clk * of the DPLL to round a rate for - * @target_rate: the desired rate of the DPLL - * @parent_rate: clock rate of the DPLL parent - * - * Compute the rate that would be programmed into the DPLL hardware - * for @clk if set_rate() were to be provided with the rate - * @target_rate. Takes the REGM4XEN bit into consideration, which is - * needed for the OMAP4 ABE DPLL. Returns the rounded rate (before - * M-dividers) upon success, -EINVAL if @clk is null or not a DPLL, or - * ~0 if an error occurred in omap2_dpll_round_rate(). - */ -long omap4_dpll_regm4xen_round_rate(struct clk_hw *hw, - unsigned long target_rate, - unsigned long *parent_rate) -{ - struct clk_hw_omap *clk = to_clk_hw_omap(hw); - struct dpll_data *dd; - long r; - - if (!clk || !clk->dpll_data) - return -EINVAL; - - dd = clk->dpll_data; - - dd->last_rounded_m4xen = 0; - - /* - * First try to compute the DPLL configuration for - * target rate without using the 4X multiplier. - */ - r = omap2_dpll_round_rate(hw, target_rate, NULL); - if (r != ~0) - goto out; - - /* - * If we did not find a valid DPLL configuration, try again, but - * this time see if using the 4X multiplier can help. Enabling the - * 4X multiplier is equivalent to dividing the target rate by 4. - */ - r = omap2_dpll_round_rate(hw, target_rate / OMAP4430_REGM4XEN_MULT, - NULL); - if (r == ~0) - return r; - - dd->last_rounded_rate *= OMAP4430_REGM4XEN_MULT; - dd->last_rounded_m4xen = 1; - -out: - omap4_dpll_lpmode_recalc(dd); - - return dd->last_rounded_rate; -} - /** * omap4_dpll_regm4xen_determine_rate - determine rate for a DPLL * @hw: pointer to the clock to determine rate for @@ -195,7 +140,7 @@ out: * * Determines which DPLL mode to use for reaching a desired rate. * Checks whether the DPLL shall be in bypass or locked mode, and if - * locked, calculates the M,N values for the DPLL via round-rate. + * locked, calculates the M,N values for the DPLL. * Returns 0 on success and a negative error value otherwise. 
*/ int omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw, @@ -215,8 +160,36 @@ int omap4_dpll_regm4xen_determine_rate(struct clk_hw *hw, (dd->modes & (1 << DPLL_LOW_POWER_BYPASS))) { req->best_parent_hw = dd->clk_bypass; } else { - req->rate = omap4_dpll_regm4xen_round_rate(hw, req->rate, - &req->best_parent_rate); + struct clk_rate_request tmp_req; + long r; + + clk_hw_init_rate_request(hw, &tmp_req, req->rate); + dd->last_rounded_m4xen = 0; + + /* + * First try to compute the DPLL configuration for + * target rate without using the 4X multiplier. + */ + + r = omap2_dpll_determine_rate(hw, &tmp_req); + if (r < 0) { + /* + * If we did not find a valid DPLL configuration, try again, but + * this time see if using the 4X multiplier can help. Enabling the + * 4X multiplier is equivalent to dividing the target rate by 4. + */ + tmp_req.rate /= OMAP4430_REGM4XEN_MULT; + r = omap2_dpll_determine_rate(hw, &tmp_req); + if (r < 0) + return r; + + dd->last_rounded_rate *= OMAP4430_REGM4XEN_MULT; + dd->last_rounded_m4xen = 1; + } + + omap4_dpll_lpmode_recalc(dd); + + req->rate = dd->last_rounded_rate; req->best_parent_hw = dd->clk_ref; } diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index e656f63efdce..54a3fa370004 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -34,14 +34,14 @@ struct clk_omap_reg { * @clk_ref: struct clk_hw pointer to the clock's reference clock input * @control_reg: register containing the DPLL mode bitfield * @enable_mask: mask of the DPLL mode bitfield in @control_reg - * @last_rounded_rate: cache of the last rate result of omap2_dpll_round_rate() - * @last_rounded_m: cache of the last M result of omap2_dpll_round_rate() + * @last_rounded_rate: cache of the last rate result of omap2_dpll_determine_rate() + * @last_rounded_m: cache of the last M result of omap2_dpll_determine_rate() * @last_rounded_m4xen: cache of the last M4X result of - * omap4_dpll_regm4xen_round_rate() + * omap4_dpll_regm4xen_determine_rate() * @last_rounded_lpmode: cache of the last lpmode result of * omap4_dpll_lpmode_recalc() * @max_multiplier: maximum valid non-bypass multiplier value (actual) - * @last_rounded_n: cache of the last N result of omap2_dpll_round_rate() + * @last_rounded_n: cache of the last N result of omap2_dpll_determine_rate() * @min_divider: minimum valid non-bypass divider value (actual) * @max_divider: maximum valid non-bypass divider value (actual) * @max_rate: maximum clock rate for the DPLL -- cgit v1.2.3 From 3f5952917498e7bb9d227812d4349668f62c413b Mon Sep 17 00:00:00 2001 From: Per Larsen Date: Wed, 20 Aug 2025 01:10:09 +0000 Subject: KVM: arm64: Mask response to FFA_FEATURE call The minimum size and alignment boundary for FFA_RXTX_MAP is returned in bits[1:0]. Mask off any other bits in w2 when reading the minimum buffer size in hyp_ffa_post_init.
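As a minimal sketch (illustrative only; it mirrors the ffa.c hunk and the FFA_FEAT_RXTX_MIN_SZ_* values below, and the default arm is an assumption), the masked decode keeps only bits[1:0] of the returned register:

	switch (res.a2 & FFA_FEAT_RXTX_MIN_SZ_MASK) {	/* GENMASK(1, 0) == 0x3 */
	case FFA_FEAT_RXTX_MIN_SZ_4K:	/* encoding 0 */
		min_rxtx_sz = SZ_4K;
		break;
	case FFA_FEAT_RXTX_MIN_SZ_64K:	/* encoding 1 */
		min_rxtx_sz = SZ_64K;
		break;
	case FFA_FEAT_RXTX_MIN_SZ_16K:	/* encoding 2 */
		min_rxtx_sz = SZ_16K;
		break;
	default:	/* assumption: treat reserved encodings as unsupported */
		return -EOPNOTSUPP;
	}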
Acked-by: Will Deacon Signed-off-by: Per Larsen Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/ffa.c | 2 +- include/linux/arm_ffa.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 96109aa99c48..a8ec1a94f3f8 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -739,7 +739,7 @@ static int hyp_ffa_post_init(void) if (res.a0 != FFA_SUCCESS) return -EOPNOTSUPP; - switch (res.a2) { + switch (res.a2 & FFA_FEAT_RXTX_MIN_SZ_MASK) { case FFA_FEAT_RXTX_MIN_SZ_4K: min_rxtx_sz = SZ_4K; break; diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index e1634897e159..cd7ee4df9045 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -128,6 +128,7 @@ #define FFA_FEAT_RXTX_MIN_SZ_4K 0 #define FFA_FEAT_RXTX_MIN_SZ_64K 1 #define FFA_FEAT_RXTX_MIN_SZ_16K 2 +#define FFA_FEAT_RXTX_MIN_SZ_MASK GENMASK(1, 0) /* FFA Bus/Device/Driver related */ struct ffa_device { -- cgit v1.2.3 From 6606c8c7e81886565f5cbdb0c0ce82e280c2b229 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 09:43:58 -0700 Subject: bitops: Add __attribute_const__ to generic ffs()-family implementations While tracking down a problem where constant expressions used by BUILD_BUG_ON() suddenly stopped working[1], we found that an added static initializer was convincing the compiler that it couldn't track the state of the prior statically initialized value. Tracing this down found that ffs() was used in the initializer macro, but since it wasn't marked with __attribute_const__, the compiler had to assume the function might change variable states as a side-effect (which is not true for ffs(), which provides deterministic math results). Add missing __attribute_const__ annotations to generic implementations of ffs(), __ffs(), fls(), and __fls() functions. These are pure mathematical functions that always return the same result for the same input with no side effects, making them eligible for compiler optimization. Build tested with x86_64 defconfig using GCC 14.2.0, which should validate the implementations when used by ARM, ARM64, LoongArch, Microblaze, NIOS2, and SPARC32 architectures. Link: https://github.com/KSPP/linux/issues/364 [1] Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250804164417.1612371-2-kees@kernel.org Signed-off-by: Kees Cook --- include/asm-generic/bitops/__ffs.h | 2 +- include/asm-generic/bitops/__fls.h | 2 +- include/asm-generic/bitops/builtin-__ffs.h | 2 +- include/asm-generic/bitops/builtin-__fls.h | 2 +- include/asm-generic/bitops/builtin-fls.h | 2 +- include/asm-generic/bitops/ffs.h | 2 +- include/asm-generic/bitops/fls.h | 2 +- include/asm-generic/bitops/fls64.h | 4 ++-- include/linux/bitops.h | 2 +- lib/clz_ctz.c | 8 ++++---- 10 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/bitops/__ffs.h b/include/asm-generic/bitops/__ffs.h index 2d08c750c8a7..3a899c626fdc 100644 --- a/include/asm-generic/bitops/__ffs.h +++ b/include/asm-generic/bitops/__ffs.h @@ -10,7 +10,7 @@ * * Undefined if no bit exists, so code should check against 0 first.
*/ -static __always_inline unsigned int generic___ffs(unsigned long word) +static __always_inline __attribute_const__ unsigned int generic___ffs(unsigned long word) { unsigned int num = 0; diff --git a/include/asm-generic/bitops/__fls.h b/include/asm-generic/bitops/__fls.h index e974ec932ec1..35f33780ca6c 100644 --- a/include/asm-generic/bitops/__fls.h +++ b/include/asm-generic/bitops/__fls.h @@ -10,7 +10,7 @@ * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned int generic___fls(unsigned long word) +static __always_inline __attribute_const__ unsigned int generic___fls(unsigned long word) { unsigned int num = BITS_PER_LONG - 1; diff --git a/include/asm-generic/bitops/builtin-__ffs.h b/include/asm-generic/bitops/builtin-__ffs.h index cf4b3d33bf96..d3c3f567045d 100644 --- a/include/asm-generic/bitops/builtin-__ffs.h +++ b/include/asm-generic/bitops/builtin-__ffs.h @@ -8,7 +8,7 @@ * * Undefined if no bit exists, so code should check against 0 first. */ -static __always_inline unsigned int __ffs(unsigned long word) +static __always_inline __attribute_const__ unsigned int __ffs(unsigned long word) { return __builtin_ctzl(word); } diff --git a/include/asm-generic/bitops/builtin-__fls.h b/include/asm-generic/bitops/builtin-__fls.h index 6d72fc8a5259..7770c4f1bfcd 100644 --- a/include/asm-generic/bitops/builtin-__fls.h +++ b/include/asm-generic/bitops/builtin-__fls.h @@ -8,7 +8,7 @@ * * Undefined if no set bit exists, so code should check against 0 first. */ -static __always_inline unsigned int __fls(unsigned long word) +static __always_inline __attribute_const__ unsigned int __fls(unsigned long word) { return (sizeof(word) * 8) - 1 - __builtin_clzl(word); } diff --git a/include/asm-generic/bitops/builtin-fls.h b/include/asm-generic/bitops/builtin-fls.h index c8455cc28841..be707da8c7cd 100644 --- a/include/asm-generic/bitops/builtin-fls.h +++ b/include/asm-generic/bitops/builtin-fls.h @@ -9,7 +9,7 @@ * This is defined the same way as ffs. * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int fls(unsigned int x) +static __always_inline __attribute_const__ int fls(unsigned int x) { return x ? sizeof(x) * 8 - __builtin_clz(x) : 0; } diff --git a/include/asm-generic/bitops/ffs.h b/include/asm-generic/bitops/ffs.h index 4c43f242daeb..5ff2b7fbda6d 100644 --- a/include/asm-generic/bitops/ffs.h +++ b/include/asm-generic/bitops/ffs.h @@ -10,7 +10,7 @@ * the libc and compiler builtin ffs routines, therefore * differs in spirit from ffz (man ffs). */ -static inline int generic_ffs(int x) +static inline __attribute_const__ int generic_ffs(int x) { int r = 1; diff --git a/include/asm-generic/bitops/fls.h b/include/asm-generic/bitops/fls.h index 26f3ce1dd6e4..8eed3437edb9 100644 --- a/include/asm-generic/bitops/fls.h +++ b/include/asm-generic/bitops/fls.h @@ -10,7 +10,7 @@ * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ -static __always_inline int generic_fls(unsigned int x) +static __always_inline __attribute_const__ int generic_fls(unsigned int x) { int r = 32; diff --git a/include/asm-generic/bitops/fls64.h b/include/asm-generic/bitops/fls64.h index 866f2b2304ff..b5f58dd261a3 100644 --- a/include/asm-generic/bitops/fls64.h +++ b/include/asm-generic/bitops/fls64.h @@ -16,7 +16,7 @@ * at position 64. 
*/ #if BITS_PER_LONG == 32 -static __always_inline int fls64(__u64 x) +static __always_inline __attribute_const__ int fls64(__u64 x) { __u32 h = x >> 32; if (h) @@ -24,7 +24,7 @@ static __always_inline int fls64(__u64 x) return fls(x); } #elif BITS_PER_LONG == 64 -static __always_inline int fls64(__u64 x) +static __always_inline __attribute_const__ int fls64(__u64 x) { if (x == 0) return 0; diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 9be2d50da09a..ea7898cc5903 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -267,7 +267,7 @@ static inline int parity8(u8 val) * The result is not defined if no bits are set, so check that @word * is non-zero before calling this. */ -static inline unsigned int __ffs64(u64 word) +static inline __attribute_const__ unsigned int __ffs64(u64 word) { #if BITS_PER_LONG == 32 if (((u32)word) == 0UL) diff --git a/lib/clz_ctz.c b/lib/clz_ctz.c index fb8c0c5c2bd2..8778ec44bf63 100644 --- a/lib/clz_ctz.c +++ b/lib/clz_ctz.c @@ -15,28 +15,28 @@ #include int __weak __ctzsi2(int val); -int __weak __ctzsi2(int val) +int __weak __attribute_const__ __ctzsi2(int val) { return __ffs(val); } EXPORT_SYMBOL(__ctzsi2); int __weak __clzsi2(int val); -int __weak __clzsi2(int val) +int __weak __attribute_const__ __clzsi2(int val) { return 32 - fls(val); } EXPORT_SYMBOL(__clzsi2); int __weak __clzdi2(u64 val); -int __weak __clzdi2(u64 val) +int __weak __attribute_const__ __clzdi2(u64 val) { return 64 - fls64(val); } EXPORT_SYMBOL(__clzdi2); int __weak __ctzdi2(u64 val); -int __weak __ctzdi2(u64 val) +int __weak __attribute_const__ __ctzdi2(u64 val) { return __ffs64(val); } -- cgit v1.2.3 From 2d0ddbb65cef99aab241378b0f4ff2d6ea8c3a5a Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 31 Aug 2025 09:04:06 -0700 Subject: Drivers: hv: Simplify data structures for VMBus channel close message struct vmbus_close_msg is used for sending the VMBus channel close message. It contains a struct vmbus_channel_msginfo, which has a flex array member at the end. The latter's presence in the middle of struct vmbus_close_msg causes warnings when built with -Wflex-array-member-not-at-end. But the struct vmbus_channel_msginfo is unused because the Hyper-V host does not send a response to the channel close message. So remove the struct vmbus_channel_msginfo. Then, since the only remaining field is struct vmbus_channel_close_channel, also remove the containing struct vmbus_close_msg and directly use struct vmbus_channel_close_channel. Besides eliminating unnecessary complexity, these changes resolve the -Wflex-array-member-not-at-end warnings. 
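A minimal sketch of why the warning fires (hypothetical type names, not the driver's): a structure that ends in a flexible array member may only sit at the very end of a containing structure, since anything placed after it would overlay the array's storage:

	struct close_channel { unsigned int child_relid; };

	struct msginfo {
		unsigned int len;
		unsigned char msg[];		/* flexible array member */
	};

	struct close_msg {
		struct msginfo info;		/* flex array lands mid-struct: warning */
		struct close_channel body;	/* would overlay info.msg[] */
	};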
Signed-off-by: Michael Kelley Reviewed-by: Tianyu Lan Signed-off-by: Wei Liu --- drivers/hv/channel.c | 2 +- include/linux/hyperv.h | 7 +------ 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 7c7c66e0dc3f..162d6aeece7b 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -925,7 +925,7 @@ static int vmbus_close_internal(struct vmbus_channel *channel) /* Send a closing message */ - msg = &channel->close_msg.msg; + msg = &channel->close_msg; msg->header.msgtype = CHANNELMSG_CLOSECHANNEL; msg->child_relid = channel->offermsg.child_relid; diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index a59c5c3e95fb..59826c89171c 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -707,11 +707,6 @@ struct vmbus_channel_msginfo { unsigned char msg[]; }; -struct vmbus_close_msg { - struct vmbus_channel_msginfo info; - struct vmbus_channel_close_channel msg; -}; - enum vmbus_device_type { HV_IDE = 0, HV_SCSI, @@ -800,7 +795,7 @@ struct vmbus_channel { struct hv_ring_buffer_info outbound; /* send to parent */ struct hv_ring_buffer_info inbound; /* receive from parent */ - struct vmbus_close_msg close_msg; + struct vmbus_channel_close_channel close_msg; /* Statistics */ u64 interrupts; /* Host to Guest interrupts */ -- cgit v1.2.3 From ff97bc38be343e4530e2f140b40cbdce2e09152f Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 3 Sep 2025 10:30:00 +0300 Subject: net/mlx5: Add RS FEC histogram infrastructure Define the Ports Phy Histogram Configuration Register (PPHCR) to expose RS-FEC histogram bin ranges, and expose a new counter group in the Ports Performance Counters Register (PPCNT) to report the corresponding histogram values. Co-developed-by: Yael Chemla Signed-off-by: Yael Chemla Signed-off-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1756884600-520195-1-git-send-email-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 1 + include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 29 +++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 72a83666e67f..d7f46a8fbfa1 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1525,6 +1525,7 @@ enum { MLX5_PHYSICAL_LAYER_RECOVERY_GROUP = 0x1a, MLX5_INFINIBAND_PORT_COUNTERS_GROUP = 0x20, MLX5_INFINIBAND_EXTENDED_PORT_COUNTERS_GROUP = 0x21, + MLX5_RS_FEC_HISTOGRAM_GROUP = 0x23, }; enum { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8c5fbfb85749..c0858af0e854 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -130,6 +130,7 @@ enum { MLX5_REG_PDDR = 0x5031, MLX5_REG_PMLP = 0x5002, MLX5_REG_PPLM = 0x5023, + MLX5_REG_PPHCR = 0x503E, MLX5_REG_PCAM = 0x507f, MLX5_REG_NODE_DESC = 0x6001, MLX5_REG_HOST_ENDIANNESS = 0x7004, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index e9f14a0c7f4f..097b1b7ada63 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4901,6 +4901,11 @@ union mlx5_ifc_field_select_802_1_r_roce_auto_bits { u8 reserved_at_0[0x20]; }; +struct mlx5_ifc_rs_histogram_cntrs_bits { + u8 hist[16][0x40]; + u8 reserved_at_400[0x2c0]; +}; + union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits { struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits eth_802_3_cntrs_grp_data_layout; 
struct mlx5_ifc_eth_2863_cntrs_grp_data_layout_bits eth_2863_cntrs_grp_data_layout; @@ -4915,6 +4920,7 @@ union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits { struct mlx5_ifc_phys_layer_cntrs_bits phys_layer_cntrs; struct mlx5_ifc_phys_layer_statistical_cntrs_bits phys_layer_statistical_cntrs; struct mlx5_ifc_phys_layer_recovery_cntrs_bits phys_layer_recovery_cntrs; + struct mlx5_ifc_rs_histogram_cntrs_bits rs_histogram_cntrs; u8 reserved_at_0[0x7c0]; }; @@ -11738,6 +11744,28 @@ struct mlx5_ifc_mtctr_reg_bits { u8 second_clock_timestamp[0x40]; }; +struct mlx5_ifc_bin_range_layout_bits { + u8 reserved_at_0[0xa]; + u8 high_val[0x6]; + u8 reserved_at_10[0xa]; + u8 low_val[0x6]; +}; + +struct mlx5_ifc_pphcr_reg_bits { + u8 active_hist_type[0x4]; + u8 reserved_at_4[0x4]; + u8 local_port[0x8]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x8]; + u8 num_of_bins[0x8]; + u8 reserved_at_30[0x10]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_bin_range_layout_bits bin_range[16]; +}; + union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_bufferx_reg_bits bufferx_reg; struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout; @@ -11804,6 +11832,7 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_mtmp_reg_bits mtmp_reg; struct mlx5_ifc_mtptm_reg_bits mtptm_reg; struct mlx5_ifc_mtctr_reg_bits mtctr_reg; + struct mlx5_ifc_pphcr_reg_bits pphcr_reg; u8 reserved_at_0[0x60e0]; }; -- cgit v1.2.3 From cdea7cdae26995992a766443e1ea862923f2443d Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Thu, 21 Aug 2025 15:28:14 +0200 Subject: hrtimer: Use hrtimer_cb_get_time() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various other helpers contain open-coded implementations of hrtimer_cb_get_time(). This prevents refactoring the implementation. Reuse the existing helper. For this to work, move hrtimer_cb_get_time() a bit up in the file and also make its argument 'const'. 
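As a usage sketch (hypothetical callback, not from this patch), hrtimer_cb_get_time() is the single accessor through which a callback reads its base clock, e.g. when re-arming a periodic timer:

	static enum hrtimer_restart sample_fn(struct hrtimer *t)
	{
		/* advance by one period relative to the timer's base clock */
		hrtimer_forward(t, hrtimer_cb_get_time(t), ms_to_ktime(100));
		return HRTIMER_RESTART;
	}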
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20250821-hrtimer-cleanup-get_time-v2-7-3ae822e5bfbd@linutronix.de --- include/linux/hrtimer.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 1ef867bb8c44..e655502b14e6 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -154,14 +154,14 @@ static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) return ktime_to_ns(timer->node.expires); } -static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) +static inline ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) { - return ktime_sub(timer->node.expires, timer->base->get_time()); + return timer->base->get_time(); } -static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) +static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { - return timer->base->get_time(); + return ktime_sub(timer->node.expires, hrtimer_cb_get_time(timer)); } static inline int hrtimer_is_hres_active(struct hrtimer *timer) @@ -200,8 +200,7 @@ __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) static inline ktime_t hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) { - return __hrtimer_expires_remaining_adjusted(timer, - timer->base->get_time()); + return __hrtimer_expires_remaining_adjusted(timer, hrtimer_cb_get_time(timer)); } #ifdef CONFIG_TIMERFD @@ -363,7 +362,7 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); static inline u64 hrtimer_forward_now(struct hrtimer *timer, ktime_t interval) { - return hrtimer_forward(timer, timer->base->get_time(), interval); + return hrtimer_forward(timer, hrtimer_cb_get_time(timer), interval); } /* Precise sleep: */ -- cgit v1.2.3 From 009eb5da29a91016e3ebb988e6401e79411be7a1 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Thu, 21 Aug 2025 15:28:15 +0200 Subject: hrtimer: Remove hrtimer_clock_base::get_time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The get_time() callbacks always need to match the base's clockid. Instead of maintaining that association twice in hrtimer_bases, use a helper.
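A minimal sketch of the replacement (mirroring the hrtimer.c hunk below): the read-out is derived from the base's clockid instead of being stored as a per-base function pointer:

	static ktime_t __hrtimer_cb_get_time(clockid_t clock_id)
	{
		switch (clock_id) {
		case CLOCK_MONOTONIC:	return ktime_get();
		case CLOCK_REALTIME:	return ktime_get_real();
		case CLOCK_BOOTTIME:	return ktime_get_boottime();
		case CLOCK_TAI:		return ktime_get_clocktai();
		default:		return ktime_get();	/* WARNs in the actual hunk */
		}
	}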
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20250821-hrtimer-cleanup-get_time-v2-8-3ae822e5bfbd@linutronix.de --- include/linux/hrtimer.h | 5 +---- include/linux/hrtimer_defs.h | 2 -- kernel/time/hrtimer.c | 34 +++++++++++++++++++++++++--------- kernel/time/timer_list.c | 2 -- scripts/gdb/linux/timerlist.py | 2 -- 5 files changed, 26 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index e655502b14e6..2cf1bf65b225 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -154,10 +154,7 @@ static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) return ktime_to_ns(timer->node.expires); } -static inline ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) -{ - return timer->base->get_time(); -} +ktime_t hrtimer_cb_get_time(const struct hrtimer *timer); static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 84a5045f80f3..aa49ffa130e5 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -41,7 +41,6 @@ * @seq: seqcount around __run_hrtimer * @running: pointer to the currently running hrtimer * @active: red black tree root node for the active timers - * @get_time: function to retrieve the current time of the clock * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { @@ -51,7 +50,6 @@ struct hrtimer_clock_base { seqcount_raw_spinlock_t seq; struct hrtimer *running; struct timerqueue_head active; - ktime_t (*get_time)(void); ktime_t offset; } __hrtimer_clock_base_align; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 30899a8cc52c..fedd1d793f6c 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -59,6 +59,7 @@ #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) static void retrigger_next_event(void *arg); +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id); /* * The timer bases: @@ -76,42 +77,34 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, }, { .index = HRTIMER_BASE_MONOTONIC_SOFT, .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME_SOFT, .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME_SOFT, .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, }, }, .csd = CSD_INIT(retrigger_next_event, NULL) @@ -1253,7 +1246,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) - tim = ktime_add_safe(tim, base->get_time()); + tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); tim = hrtimer_update_lowres(timer, tim, mode); @@ -1588,6 +1581,29 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) } } +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id) +{ + switch (clock_id) { + case CLOCK_MONOTONIC: + return ktime_get(); + 
case CLOCK_REALTIME: + return ktime_get_real(); + case CLOCK_BOOTTIME: + return ktime_get_boottime(); + case CLOCK_TAI: + return ktime_get_clocktai(); + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return ktime_get(); + } +} + +ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) +{ + return __hrtimer_cb_get_time(timer->base->clockid); +} +EXPORT_SYMBOL_GPL(hrtimer_cb_get_time); + static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index b03d0ada6469..488e47e96e93 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -102,8 +102,6 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); - - SEQ_printf(m, " .get_time: %ps\n", base->get_time); #ifdef CONFIG_HIGH_RES_TIMERS SEQ_printf(m, " .offset: %Lu nsecs\n", (unsigned long long) ktime_to_ns(base->offset)); diff --git a/scripts/gdb/linux/timerlist.py b/scripts/gdb/linux/timerlist.py index 98445671fe83..ccc24d30de80 100644 --- a/scripts/gdb/linux/timerlist.py +++ b/scripts/gdb/linux/timerlist.py @@ -56,8 +56,6 @@ def print_base(base): text += " .index: {}\n".format(base['index']) text += " .resolution: {} nsecs\n".format(constants.LX_hrtimer_resolution) - - text += " .get_time: {}\n".format(base['get_time']) if constants.LX_CONFIG_HIGH_RES_TIMERS: text += " .offset: {} nsecs\n".format(base['offset']) text += "active timers:\n" -- cgit v1.2.3 From 70a6f71b1a77decfc5b1db426ccbe914b58adb38 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Sep 2025 12:56:38 +0200 Subject: block: add a bio_init_inline helper Just a simpler wrapper around bio_init for callers that want to initialize a bio with inline bvecs. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Yu Kuai Signed-off-by: Jens Axboe --- block/bio.c | 7 +++++-- block/blk-crypto-fallback.c | 3 +-- block/blk-map.c | 8 ++++---- drivers/md/bcache/debug.c | 3 +-- drivers/md/bcache/io.c | 3 +-- drivers/md/bcache/journal.c | 2 +- drivers/md/bcache/movinggc.c | 2 +- drivers/md/bcache/super.c | 2 +- drivers/md/bcache/writeback.c | 2 +- drivers/md/dm-bufio.c | 2 +- drivers/md/dm-flakey.c | 2 +- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 4 ++-- drivers/target/target_core_pscsi.c | 2 +- fs/bcachefs/btree_io.c | 2 +- fs/bcachefs/journal.c | 2 +- fs/bcachefs/journal_io.c | 2 +- fs/bcachefs/super-io.c | 2 +- fs/squashfs/block.c | 2 +- include/linux/bio.h | 5 +++++ 20 files changed, 32 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 44c43b970387..bd8bf179981a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -462,7 +462,10 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, cache->nr--; put_cpu(); - bio_init(bio, bdev, nr_vecs ? 
bio->bi_inline_vecs : NULL, nr_vecs, opf); + if (nr_vecs) + bio_init_inline(bio, bdev, nr_vecs, opf); + else + bio_init(bio, bdev, NULL, nr_vecs, opf); bio->bi_pool = bs; return bio; } @@ -578,7 +581,7 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, bio_init(bio, bdev, bvl, nr_vecs, opf); } else if (nr_vecs) { - bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf); + bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf); } else { bio_init(bio, bdev, NULL, 0, opf); } diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 005c9157ffb3..dbc2d8784dab 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -167,8 +167,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) bio = bio_kmalloc(nr_segs, GFP_NOIO); if (!bio) return NULL; - bio_init(bio, bio_src->bi_bdev, bio->bi_inline_vecs, nr_segs, - bio_src->bi_opf); + bio_init_inline(bio, bio_src->bi_bdev, nr_segs, bio_src->bi_opf); if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; diff --git a/block/blk-map.c b/block/blk-map.c index 23e5d5ebe59e..92b7e19b1a1f 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -157,7 +157,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) goto out_bmd; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq)); + bio_init_inline(bio, NULL, nr_pages, req_op(rq)); if (map_data) { nr_pages = 1U << map_data->page_order; @@ -264,7 +264,7 @@ static struct bio *blk_rq_map_bio_alloc(struct request *rq, bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return NULL; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); + bio_init_inline(bio, NULL, nr_vecs, req_op(rq)); } return bio; } @@ -326,7 +326,7 @@ static struct bio *bio_map_kern(void *data, unsigned int len, enum req_op op, bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, op); + bio_init_inline(bio, NULL, nr_vecs, op); if (is_vmalloc_addr(data)) { bio->bi_private = data; if (!bio_add_vmalloc(bio, data, len)) { @@ -392,7 +392,7 @@ static struct bio *bio_copy_kern(void *data, unsigned int len, enum req_op op, bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); - bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, op); + bio_init_inline(bio, NULL, nr_pages, op); while (len) { struct page *page; diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 7510d1c983a5..f327456fc4e0 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -115,8 +115,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) check = bio_kmalloc(nr_segs, GFP_NOIO); if (!check) return; - bio_init(check, bio->bi_bdev, check->bi_inline_vecs, nr_segs, - REQ_OP_READ); + bio_init_inline(check, bio->bi_bdev, nr_segs, REQ_OP_READ); check->bi_iter.bi_sector = bio->bi_iter.bi_sector; check->bi_iter.bi_size = bio->bi_iter.bi_size; diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 020712c5203f..2386d08bf4e4 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -26,8 +26,7 @@ struct bio *bch_bbio_alloc(struct cache_set *c) struct bbio *b = mempool_alloc(&c->bio_meta, GFP_NOIO); struct bio *bio = &b->bio; - bio_init(bio, NULL, bio->bi_inline_vecs, - meta_bucket_pages(&c->cache->sb), 0); + bio_init_inline(bio, NULL, meta_bucket_pages(&c->cache->sb), 0); return bio; } diff --git 
a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 7ff14bd2feb8..d50eb82ccb4f 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -615,7 +615,7 @@ static void do_journal_discard(struct cache *ca) atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); - bio_init(bio, ca->bdev, bio->bi_inline_vecs, 1, REQ_OP_DISCARD); + bio_init_inline(bio, ca->bdev, 1, REQ_OP_DISCARD); bio->bi_iter.bi_sector = bucket_to_sector(ca->set, ca->sb.d[ja->discard_idx]); bio->bi_iter.bi_size = bucket_bytes(ca); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 26a6a535ec32..4fc80c6d5b31 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -79,7 +79,7 @@ static void moving_init(struct moving_io *io) { struct bio *bio = &io->bio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, + bio_init_inline(bio, NULL, DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0); bio_get(bio); bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1492c8552255..6d250e366412 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2236,7 +2236,7 @@ static int cache_alloc(struct cache *ca) __module_get(THIS_MODULE); kobject_init(&ca->kobj, &bch_cache_ktype); - bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0); + bio_init_inline(&ca->journal.bio, NULL, 8, 0); /* * When the cache disk is first registered, ca->sb.njournal_buckets diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 302e75f1fc4b..36dd8f14a6df 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -331,7 +331,7 @@ static void dirty_init(struct keybuf_key *w) struct dirty_io *io = w->private; struct bio *bio = &io->bio; - bio_init(bio, NULL, bio->bi_inline_vecs, + bio_init_inline(bio, NULL, DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0); if (!io->dc->writeback_percent) bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index ff7595caf440..8f3a23f4b168 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1342,7 +1342,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector, use_dmio(b, op, sector, n_sectors, offset, ioprio); return; } - bio_init(bio, b->c->bdev, bio->bi_inline_vecs, 1, op); + bio_init_inline(bio, b->c->bdev, 1, op); bio->bi_iter.bi_sector = sector; bio->bi_end_io = bio_complete; bio->bi_private = b; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index cf17fd46e255..08925aca838c 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -441,7 +441,7 @@ static struct bio *clone_bio(struct dm_target *ti, struct flakey_c *fc, struct b if (!clone) return NULL; - bio_init(clone, fc->dev->bdev, clone->bi_inline_vecs, nr_iovecs, bio->bi_opf); + bio_init_inline(clone, fc->dev->bdev, nr_iovecs, bio->bi_opf); clone->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); clone->bi_private = bio; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 408c26398321..bc11aaa38615 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -167,7 +167,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r1_bio->bios[j] = bio; } /* diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c 
index b60c30bfb6c7..c52ccd12d86b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -163,14 +163,14 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r10_bio->devs[j].bio = bio; if (!conf->have_replacement) continue; bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); if (!bio) goto out_free_bio; - bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); + bio_init_inline(bio, NULL, RESYNC_PAGES, 0); r10_bio->devs[j].repl_bio = bio; } /* diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index f991cf759836..db4e09042469 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -861,7 +861,7 @@ new_bio: bio = bio_kmalloc(nr_vecs, GFP_KERNEL); if (!bio) goto fail; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, + bio_init_inline(bio, NULL, nr_vecs, rw ? REQ_OP_WRITE : REQ_OP_READ); bio->bi_end_io = pscsi_bi_endio; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 590cd29f3e86..e701e6d6e321 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -2084,7 +2084,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans, INIT_WORK(&scrub->work, btree_node_scrub_work); - bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ); + bio_init_inline(&scrub->bio, ca->disk_sb.bdev, vecs, REQ_OP_READ); bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size); scrub->bio.bi_iter.bi_sector = pick.ptr.offset; scrub->bio.bi_end_io = btree_node_scrub_endio; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index ddfeb0dafc9d..3dbf9faaaa4c 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1634,7 +1634,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ja->bio[i]->ca = ca; ja->bio[i]->buf_idx = i; - bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0); + bio_init_inline(&ja->bio[i]->bio, NULL, nr_bvecs, 0); } ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 9e028dbcc3d0..ce8888d518c3 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1071,7 +1071,7 @@ reread: bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); if (!bio) return bch_err_throw(c, ENOMEM_journal_read_bucket); - bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); + bio_init_inline(bio, ca->disk_sb.bdev, nr_bvecs, REQ_OP_READ); bio->bi_iter.bi_sector = offset; bch2_bio_map(bio, buf->data, sectors_read << 9); diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 6c2e1d647403..e3fcffb93d2b 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -232,7 +232,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) if (!bio) return -BCH_ERR_ENOMEM_sb_bio_realloc; - bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); + bio_init_inline(bio, NULL, nr_bvecs, 0); kfree(sb->bio); sb->bio = bio; diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index b69c294e3ef0..a05e3793f93a 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -231,7 +231,7 @@ static int squashfs_bio_read(struct super_block *sb, u64 index, int length, bio = bio_kmalloc(page_count, GFP_NOIO); if (!bio) return -ENOMEM; - bio_init(bio, sb->s_bdev, bio->bi_inline_vecs, page_count, REQ_OP_READ); + bio_init_inline(bio, sb->s_bdev, 
page_count, REQ_OP_READ); bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT); for (i = 0; i < page_count; ++i) { diff --git a/include/linux/bio.h b/include/linux/bio.h index 46ffac5caab7..eb7f4fbd8aa9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -405,6 +405,11 @@ struct request_queue; void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, unsigned short max_vecs, blk_opf_t opf); +static inline void bio_init_inline(struct bio *bio, struct block_device *bdev, + unsigned short max_vecs, blk_opf_t opf) +{ + bio_init(bio, bdev, bio->bi_inline_vecs, max_vecs, opf); +} extern void bio_uninit(struct bio *); void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf); void bio_chain(struct bio *, struct bio *); -- cgit v1.2.3 From d86eaa0f3c56da286853b698b45c8ce404291082 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Sep 2025 12:56:39 +0200 Subject: block: remove the bi_inline_vecs variable sized array from struct bio Bios are embedded into other structures, and at least sparse is unhappy about embedding structures with variable sized arrays. There's no real need for the array anyway, we can replace it with a helper pointing to the memory just behind the bio, and with the previous cleanups there are very few sites doing anything special with it. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Signed-off-by: Jens Axboe --- block/bio.c | 3 ++- drivers/md/bcache/movinggc.c | 6 +++--- drivers/md/bcache/writeback.c | 6 +++--- drivers/md/dm-vdo/vio.c | 2 +- fs/bcachefs/data_update.h | 1 - fs/bcachefs/journal.c | 4 ++-- include/linux/bio.h | 2 +- include/linux/blk_types.h | 12 +++++------- 8 files changed, 17 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index bd8bf179981a..971d96afaf8d 100644 --- a/block/bio.c +++ b/block/bio.c @@ -617,7 +617,8 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) if (nr_vecs > BIO_MAX_INLINE_VECS) return NULL; - return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask); + return kmalloc(sizeof(*bio) + nr_vecs * sizeof(struct bio_vec), + gfp_mask); } EXPORT_SYMBOL(bio_kmalloc); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 4fc80c6d5b31..73918e55bf04 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -145,9 +145,9 @@ static void read_moving(struct cache_set *c) continue; } - io = kzalloc(struct_size(io, bio.bio.bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), - GFP_KERNEL); + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); + if (!io) goto err; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 36dd8f14a6df..6ba73dc1a3df 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -536,9 +536,9 @@ static void read_dirty(struct cached_dev *dc) for (i = 0; i < nk; i++) { w = keys[i]; - io = kzalloc(struct_size(io, bio.bi_inline_vecs, - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS)), - GFP_KERNEL); + io = kzalloc(sizeof(*io) + sizeof(struct bio_vec) * + DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); if (!io) goto err; diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index e7f4153e55e3..8fc22fb14196 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -212,7 +212,7 @@ int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t return VDO_SUCCESS.
bio->bi_ioprio = 0; - bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_io_vec = bio_inline_vecs(bio); bio->bi_max_vecs = vio->block_count + 1; if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d", size, vio_size) != VDO_SUCCESS) diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 5e14d13568de..1b37780abfda 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -62,7 +62,6 @@ struct promote_op { struct work_struct work; struct data_update write; - struct bio_vec bi_inline_vecs[]; /* must be last */ }; void bch2_data_update_to_text(struct printbuf *, struct data_update *); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 3dbf9faaaa4c..474e0e867b68 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1627,8 +1627,8 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, - nr_bvecs), GFP_KERNEL); + ja->bio[i] = kzalloc(sizeof(*ja->bio[i]) + + sizeof(struct bio_vec) * nr_bvecs, GFP_KERNEL); if (!ja->bio[i]) return bch_err_throw(c, ENOMEM_dev_journal_init); diff --git a/include/linux/bio.h b/include/linux/bio.h index eb7f4fbd8aa9..27cbff5b0356 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -408,7 +408,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, static inline void bio_init_inline(struct bio *bio, struct block_device *bdev, unsigned short max_vecs, blk_opf_t opf) { - bio_init(bio, bdev, bio->bi_inline_vecs, max_vecs, opf); + bio_init(bio, bdev, bio_inline_vecs(bio), max_vecs, opf); } extern void bio_uninit(struct bio *); void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 930daff207df..bbb7893e0542 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -269,18 +269,16 @@ struct bio { struct bio_vec *bi_io_vec; /* the actual vec list */ struct bio_set *bi_pool; - - /* - * We can inline a number of vecs at the end of the bio, to avoid - * double allocations for a small number of bio_vecs. This member - * MUST obviously be kept at the very end of the bio. - */ - struct bio_vec bi_inline_vecs[]; }; #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) #define BIO_MAX_SECTORS (UINT_MAX >> SECTOR_SHIFT) +static inline struct bio_vec *bio_inline_vecs(struct bio *bio) +{ + return (struct bio_vec *)(bio + 1); +} + /* * bio flags */ -- cgit v1.2.3 From 199c9a8d26638845f509b76e3c176c27e7baafd7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 9 Sep 2025 20:33:10 +0800 Subject: blk-mq: Document tags_srcu member in blk_mq_tag_set structure Add missing documentation for the tags_srcu member that was introduced to defer freeing of tags page_list to prevent use-after-free when iterating tags. Fixes htmldocs warning: WARNING: include/linux/blk-mq.h:536 struct member 'tags_srcu' not described in 'blk_mq_tag_set' Reported-by: Stephen Rothwell Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1325ceeb743a..b25d12545f46 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -507,6 +507,8 @@ enum hctx_type { * request_queue.tag_set_list. 
* @srcu: Use as lock when type of the request queue is blocking * (BLK_MQ_F_BLOCKING). + * @tags_srcu: SRCU used to defer freeing of tags page_list to prevent + * use-after-free when iterating tags. * @update_nr_hwq_lock: * Synchronize updating nr_hw_queues with add/del disk & * switching elevator. -- cgit v1.2.3 From e8c8d961d8ad53d8d104d7474c08d8e4626c7767 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 24 Jun 2025 08:41:15 +0200 Subject: media: include: update Hans Verkuil's email address Replace hverkuil@xs4all.nl by hverkuil@kernel.org. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/dt-bindings/media/tvp5150.h | 2 +- include/linux/videodev2.h | 2 +- include/media/drv-intf/cx25840.h | 2 +- include/media/drv-intf/msp3400.h | 2 +- include/media/i2c/bt819.h | 2 +- include/media/i2c/cs5345.h | 2 +- include/media/i2c/cs53l32a.h | 2 +- include/media/i2c/m52790.h | 2 +- include/media/i2c/mt9v011.h | 2 +- include/media/i2c/saa7115.h | 2 +- include/media/i2c/saa7127.h | 2 +- include/media/i2c/tvaudio.h | 2 +- include/media/i2c/upd64031a.h | 2 +- include/media/i2c/upd64083.h | 2 +- include/media/i2c/wm8775.h | 2 +- include/media/v4l2-common.h | 2 +- include/media/v4l2-ctrls.h | 2 +- include/media/v4l2-device.h | 2 +- include/media/v4l2-subdev.h | 2 +- include/uapi/linux/ivtv.h | 2 +- include/uapi/linux/videodev2.h | 2 +- 21 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/dt-bindings/media/tvp5150.h b/include/dt-bindings/media/tvp5150.h index dda00c038530..ba34c420c303 100644 --- a/include/dt-bindings/media/tvp5150.h +++ b/include/dt-bindings/media/tvp5150.h @@ -2,7 +2,7 @@ /* tvp5150.h - definition for tvp5150 inputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 219037f4c08d..9609cf365e8e 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -50,7 +50,7 @@ * * Author: Bill Dirks * Justin Schoeman - * Hans Verkuil + * Hans Verkuil * et al. 
*/ #ifndef __LINUX_VIDEODEV2_H diff --git a/include/media/drv-intf/cx25840.h b/include/media/drv-intf/cx25840.h index ba69bc525382..8b455d9dd5ca 100644 --- a/include/media/drv-intf/cx25840.h +++ b/include/media/drv-intf/cx25840.h @@ -3,7 +3,7 @@ /* * cx25840.h - definition for cx25840/1/2/3 inputs * - * Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + * Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ #ifndef _CX25840_H_ diff --git a/include/media/drv-intf/msp3400.h b/include/media/drv-intf/msp3400.h index d6dfae104a6f..853258ee6bbd 100644 --- a/include/media/drv-intf/msp3400.h +++ b/include/media/drv-intf/msp3400.h @@ -2,7 +2,7 @@ /* msp3400.h - definition for msp3400 inputs and outputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/bt819.h b/include/media/i2c/bt819.h index 70aa46bd5182..2277a7eb9548 100644 --- a/include/media/i2c/bt819.h +++ b/include/media/i2c/bt819.h @@ -2,7 +2,7 @@ /* bt819.h - bt819 notifications - Copyright (C) 2009 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2009 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/cs5345.h b/include/media/i2c/cs5345.h index d41e4dca3fcc..39e1cf6c1a2f 100644 --- a/include/media/i2c/cs5345.h +++ b/include/media/i2c/cs5345.h @@ -2,7 +2,7 @@ /* cs5345.h - definition for cs5345 inputs and outputs - Copyright (C) 2007 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2007 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/cs53l32a.h b/include/media/i2c/cs53l32a.h index 52ceb2f916d3..777f667855cb 100644 --- a/include/media/i2c/cs53l32a.h +++ b/include/media/i2c/cs53l32a.h @@ -2,7 +2,7 @@ /* cs53l32a.h - definition for cs53l32a inputs and outputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/m52790.h b/include/media/i2c/m52790.h index 3f214fa9bc64..cedaaf215273 100644 --- a/include/media/i2c/m52790.h +++ b/include/media/i2c/m52790.h @@ -2,7 +2,7 @@ /* m52790.h - definition for m52790 inputs and outputs - Copyright (C) 2007 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2007 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/mt9v011.h b/include/media/i2c/mt9v011.h index 41c00b3e7184..552839756e64 100644 --- a/include/media/i2c/mt9v011.h +++ b/include/media/i2c/mt9v011.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* mt9v011 sensor * - * Copyright (C) 2011 Hans Verkuil + * Copyright (C) 2011 Hans Verkuil */ #ifndef __MT9V011_H__ diff --git a/include/media/i2c/saa7115.h b/include/media/i2c/saa7115.h index 0cd6080d7cb1..a607c91ef5f3 100644 --- a/include/media/i2c/saa7115.h +++ b/include/media/i2c/saa7115.h @@ -2,7 +2,7 @@ /* saa7115.h - definition for saa7111/3/4/5 inputs and frequency flags - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/saa7127.h b/include/media/i2c/saa7127.h index 53ee999e6090..c81ee1743df1 100644 --- a/include/media/i2c/saa7127.h +++ b/include/media/i2c/saa7127.h @@ -2,7 +2,7 @@ /* saa7127.h - definition for saa7126/7/8/9 inputs/outputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/tvaudio.h b/include/media/i2c/tvaudio.h index 42cd3206fb6c..206f42ed4e69 100644 --- a/include/media/i2c/tvaudio.h +++ b/include/media/i2c/tvaudio.h @@ -2,7 +2,7 @@ /* 
tvaudio.h - definition for tvaudio inputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/upd64031a.h b/include/media/i2c/upd64031a.h index b6570abc84ef..d39b2b7f0cf3 100644 --- a/include/media/i2c/upd64031a.h +++ b/include/media/i2c/upd64031a.h @@ -2,7 +2,7 @@ /* * upd64031a - NEC Electronics Ghost Reduction input defines * - * 2006 by Hans Verkuil (hverkuil@xs4all.nl) + * 2006 by Hans Verkuil (hverkuil@kernel.org) */ #ifndef _UPD64031A_H_ diff --git a/include/media/i2c/upd64083.h b/include/media/i2c/upd64083.h index 17fb7b5201cc..72cf547c25fc 100644 --- a/include/media/i2c/upd64083.h +++ b/include/media/i2c/upd64083.h @@ -2,7 +2,7 @@ /* * upd6408x - NEC Electronics 3-Dimensional Y/C separation input defines * - * 2006 by Hans Verkuil (hverkuil@xs4all.nl) + * 2006 by Hans Verkuil (hverkuil@kernel.org) */ #ifndef _UPD64083_H_ diff --git a/include/media/i2c/wm8775.h b/include/media/i2c/wm8775.h index 6ccdeb3817ab..a02695ee3a58 100644 --- a/include/media/i2c/wm8775.h +++ b/include/media/i2c/wm8775.h @@ -2,7 +2,7 @@ /* wm8775.h - definition for wm8775 inputs and outputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h index d8e23991a656..594314459333 100644 --- a/include/media/v4l2-common.h +++ b/include/media/v4l2-common.h @@ -7,7 +7,7 @@ Each ioctl begins with VIDIOC_INT_ to clearly mark that it is an internal define, - Copyright (C) 2005 Hans Verkuil + Copyright (C) 2005 Hans Verkuil */ diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h index 4a294a5c7bdd..31fc1bee3797 100644 --- a/include/media/v4l2-ctrls.h +++ b/include/media/v4l2-ctrls.h @@ -2,7 +2,7 @@ /* * V4L2 controls support header. * - * Copyright (C) 2010 Hans Verkuil + * Copyright (C) 2010 Hans Verkuil */ #ifndef _V4L2_CTRLS_H diff --git a/include/media/v4l2-device.h b/include/media/v4l2-device.h index dd897a362f36..25f69b1b8db0 100644 --- a/include/media/v4l2-device.h +++ b/include/media/v4l2-device.h @@ -2,7 +2,7 @@ /* V4L2 device support header. - Copyright (C) 2008 Hans Verkuil + Copyright (C) 2008 Hans Verkuil */ diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h index 4b28086808c9..e0bb58cb6d04 100644 --- a/include/media/v4l2-subdev.h +++ b/include/media/v4l2-subdev.h @@ -2,7 +2,7 @@ /* * V4L2 sub-device support header. * - * Copyright (C) 2008 Hans Verkuil + * Copyright (C) 2008 Hans Verkuil */ #ifndef _V4L2_SUBDEV_H diff --git a/include/uapi/linux/ivtv.h b/include/uapi/linux/ivtv.h index e74f18642b11..c9241f7271c4 100644 --- a/include/uapi/linux/ivtv.h +++ b/include/uapi/linux/ivtv.h @@ -2,7 +2,7 @@ /* Public ivtv API header Copyright (C) 2003-2004 Kevin Thayer - Copyright (C) 2004-2007 Hans Verkuil + Copyright (C) 2004-2007 Hans Verkuil This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 64943f1a6149..becd08fdbddb 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -51,7 +51,7 @@ * * Author: Bill Dirks * Justin Schoeman - * Hans Verkuil + * Hans Verkuil * et al. 
*/ #ifndef _UAPI__LINUX_VIDEODEV2_H -- cgit v1.2.3 From fec2e705729dc93de5399d8b139e4746805c3d81 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2025 07:12:51 -0700 Subject: block: check for valid bio while splitting We're already iterating every segment, so check these for valid IO lengths at the same time. Individual segment lengths will not be checked on passthrough commands. The read/write command segments must be sized to the dma alignment. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-map.c | 2 +- block/blk-merge.c | 21 +++++++++++++++++---- include/linux/bio.h | 4 ++-- include/linux/blkdev.h | 7 +++++++ 4 files changed, 27 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/block/blk-map.c b/block/blk-map.c index 92b7e19b1a1f..ca4f5cf00038 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -443,7 +443,7 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) int ret; /* check that the data layout matches the hardware restrictions */ - ret = bio_split_rw_at(bio, lim, &nr_segs, max_bytes); + ret = bio_split_io_at(bio, lim, &nr_segs, max_bytes, 0); if (ret) { /* if we would have to split the bio, copy instead */ if (ret > 0) diff --git a/block/blk-merge.c b/block/blk-merge.c index 70d704615be5..cffc0fe48d8a 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -279,25 +279,30 @@ static unsigned int bio_split_alignment(struct bio *bio, } /** - * bio_split_rw_at - check if and where to split a read/write bio + * bio_split_io_at - check if and where to split a bio * @bio: [in] bio to be split * @lim: [in] queue limits to split based on * @segs: [out] number of segments in the bio with the first half of the sectors * @max_bytes: [in] maximum number of bytes per bio + * @len_align_mask: [in] length alignment mask for each vector * * Find out if @bio needs to be split to fit the queue limits in @lim and a * maximum size of @max_bytes. Returns a negative error number if @bio can't be * split, 0 if the bio doesn't have to be split, or a positive sector offset if * @bio needs to be split. */ -int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, unsigned max_bytes) +int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes, unsigned len_align_mask) { struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; unsigned nsegs = 0, bytes = 0; bio_for_each_bvec(bv, bio, iter) { + if (bv.bv_offset & lim->dma_alignment || + bv.bv_len & len_align_mask) + return -EINVAL; + /* * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. @@ -339,8 +344,16 @@ split: * Individual bvecs might not be logical block aligned. Round down the * split size so that each bio is properly block size aligned, even if * we do not use the full hardware limits. + * + * It is possible to submit a bio that can't be split into a valid io: + * there may either be too many discontiguous vectors for the max + * segments limit, or contain virtual boundary gaps without having a + * valid block sized split. A zero byte result means one of those + * conditions occurred.
*/ bytes = ALIGN_DOWN(bytes, bio_split_alignment(bio, lim)); + if (!bytes) + return -EINVAL; /* * Bio splitting may cause subtle trouble such as hang when doing sync @@ -350,7 +363,7 @@ split: bio_clear_polled(bio); return bytes >> SECTOR_SHIFT; } -EXPORT_SYMBOL_GPL(bio_split_rw_at); +EXPORT_SYMBOL_GPL(bio_split_io_at); struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *nr_segs) diff --git a/include/linux/bio.h b/include/linux/bio.h index 27cbff5b0356..13d1df02656a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -322,8 +322,8 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) void bio_trim(struct bio *bio, sector_t offset, sector_t size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); -int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, unsigned max_bytes); +int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes, unsigned len_align); /** * bio_next_split - get next @sectors from a bio, splitting if necessary diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7709d55adc23..9efacabaa2f7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1870,6 +1870,13 @@ bdev_atomic_write_unit_max_bytes(struct block_device *bdev) return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev)); } +static inline int bio_split_rw_at(struct bio *bio, + const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes) +{ + return bio_split_io_at(bio, lim, segs, max_bytes, lim->dma_alignment); +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ -- cgit v1.2.3 From 743bf2e0c49c835cb7c4e4ac7d5a2610587047be Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2025 07:12:52 -0700 Subject: block: add size alignment to bio_iov_iter_get_pages The block layer tries to align bio vectors to the block device's logical block size. Some cases don't have a block device, or we may need to align to something larger, which we can't derive it from the queue limits. Have the caller specify what they want, or allow any length alignment if nothing was specified. Since the most common use case relies on the block device's limits, a helper function is provided. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/bio.c | 19 +++++++++++-------- block/fops.c | 6 +++--- fs/iomap/direct-io.c | 2 +- include/linux/bio.h | 9 ++++++++- include/linux/blkdev.h | 7 +++++++ 5 files changed, 30 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 971d96afaf8d..f91dc9f32bdc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1208,7 +1208,8 @@ static unsigned int get_contig_folio_len(unsigned int *num_pages, * For a multi-segment *iter, this function only adds pages from the next * non-empty segment of the iov iterator. 
*/ -static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask) { iov_iter_extraction_t extraction_flags = 0; unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; @@ -1217,7 +1218,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) struct page **pages = (struct page **)bv; ssize_t size; unsigned int num_pages, i = 0; - size_t offset, folio_offset, left, len; + size_t offset, folio_offset, left, len, trim; int ret = 0; /* @@ -1246,8 +1247,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); - if (bio->bi_bdev) { - size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); + trim = size & len_align_mask; + if (trim) { iov_iter_revert(iter, trim); size -= trim; } @@ -1302,9 +1303,10 @@ out: } /** - * bio_iov_iter_get_pages - add user or kernel pages to a bio + * bio_iov_iter_get_pages_aligned - add user or kernel pages to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be added + * @len_align_mask: the mask to align each vector size to, 0 for any length * * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and @@ -1321,7 +1323,8 @@ out: * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. */ -int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask) { int ret = 0; @@ -1337,12 +1340,12 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (iov_iter_extract_will_pin(iter)) bio_set_flag(bio, BIO_PAGE_PINNED); do { - ret = __bio_iov_iter_get_pages(bio, iter); + ret = __bio_iov_iter_get_pages(bio, iter, len_align_mask); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); return bio->bi_vcnt ? 
0 : ret; } -EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); +EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages_aligned); static void submit_bio_wait_endio(struct bio *bio) { diff --git a/block/fops.c b/block/fops.c index 82451ac8ff25..d136fb5f6b6a 100644 --- a/block/fops.c +++ b/block/fops.c @@ -78,7 +78,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, if (iocb->ki_flags & IOCB_ATOMIC) bio.bi_opf |= REQ_ATOMIC; - ret = bio_iov_iter_get_pages(&bio, iter); + ret = bio_iov_iter_get_bdev_pages(&bio, iter, bdev); if (unlikely(ret)) goto out; ret = bio.bi_iter.bi_size; @@ -212,7 +212,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; - ret = bio_iov_iter_get_pages(bio, iter); + ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev); if (unlikely(ret)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); @@ -348,7 +348,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, */ bio_iov_bvec_set(bio, iter); } else { - ret = bio_iov_iter_get_pages(bio, iter); + ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev); if (unlikely(ret)) goto out_bio_put; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b84f6af2eb4c..fea23fa6a402 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -434,7 +434,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - ret = bio_iov_iter_get_pages(bio, dio->submit.iter); + ret = bio_iov_iter_get_bdev_pages(bio, dio->submit.iter, iomap->bdev); if (unlikely(ret)) { /* * We have to stop part way through an IO. We must fall diff --git a/include/linux/bio.h b/include/linux/bio.h index 13d1df02656a..a64a30131031 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -446,7 +446,14 @@ int submit_bio_wait(struct bio *bio); int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, size_t len, enum req_op op); -int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); +int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, + unsigned len_align_mask); + +static inline int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +{ + return bio_iov_iter_get_pages_aligned(bio, iter, 0); +} + void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9efacabaa2f7..44e1066f7446 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1877,6 +1877,13 @@ static inline int bio_split_rw_at(struct bio *bio, return bio_split_io_at(bio, lim, segs, max_bytes, lim->dma_alignment); } +static inline int bio_iov_iter_get_bdev_pages(struct bio *bio, + struct iov_iter *iter, struct block_device *bdev) +{ + return bio_iov_iter_get_pages_aligned(bio, iter, + bdev_logical_block_size(bdev) - 1); +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ -- cgit v1.2.3 From 9eab1d4e0d15b633adc170c458c51e8be3b1c553 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2025 07:12:56 -0700 Subject: block: remove bdev_iter_is_aligned No more callers. Signed-off-by: Keith Busch Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. 
Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 44e1066f7446..c8cb08b2ed29 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1590,13 +1590,6 @@ static inline unsigned int bdev_dma_alignment(struct block_device *bdev) return queue_dma_alignment(bdev_get_queue(bdev)); } -static inline bool bdev_iter_is_aligned(struct block_device *bdev, - struct iov_iter *iter) -{ - return iov_iter_is_aligned(iter, bdev_dma_alignment(bdev), - bdev_logical_block_size(bdev) - 1); -} - static inline unsigned int blk_lim_dma_alignment_and_pad(struct queue_limits *lim) { -- cgit v1.2.3 From b475272f03ca5d0c437c8f899ff229b21010ec83 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 27 Aug 2025 07:12:58 -0700 Subject: iov_iter: remove iov_iter_is_aligned No more callers. Signed-off-by: Keith Busch Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Reviewed-by: Mike Snitzer Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/uio.h | 2 -- lib/iov_iter.c | 95 ----------------------------------------------------- 2 files changed, 97 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 2e86c653186c..5b127043a151 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -286,8 +286,6 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i); #endif size_t iov_iter_zero(size_t bytes, struct iov_iter *); -bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, - unsigned len_mask); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, diff --git a/lib/iov_iter.c b/lib/iov_iter.c index f9193f952f49..2fe66a6b8789 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -784,101 +784,6 @@ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) } EXPORT_SYMBOL(iov_iter_discard); -static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, - unsigned len_mask) -{ - const struct iovec *iov = iter_iov(i); - size_t size = i->count; - size_t skip = i->iov_offset; - - do { - size_t len = iov->iov_len - skip; - - if (len > size) - len = size; - if (len & len_mask) - return false; - if ((unsigned long)(iov->iov_base + skip) & addr_mask) - return false; - - iov++; - size -= len; - skip = 0; - } while (size); - - return true; -} - -static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, - unsigned len_mask) -{ - const struct bio_vec *bvec = i->bvec; - unsigned skip = i->iov_offset; - size_t size = i->count; - - do { - size_t len = bvec->bv_len - skip; - - if (len > size) - len = size; - if (len & len_mask) - return false; - if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) - return false; - - bvec++; - size -= len; - skip = 0; - } while (size); - - return true; -} - -/** - * iov_iter_is_aligned() - Check if the addresses and lengths of each segments - * are aligned to the parameters. 
- * - * @i: &struct iov_iter to restore - * @addr_mask: bit mask to check against the iov element's addresses - * @len_mask: bit mask to check against the iov element's lengths - * - * Return: false if any addresses or lengths intersect with the provided masks - */ -bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, - unsigned len_mask) -{ - if (likely(iter_is_ubuf(i))) { - if (i->count & len_mask) - return false; - if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask) - return false; - return true; - } - - if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) - return iov_iter_aligned_iovec(i, addr_mask, len_mask); - - if (iov_iter_is_bvec(i)) - return iov_iter_aligned_bvec(i, addr_mask, len_mask); - - /* With both xarray and folioq types, we're dealing with whole folios. */ - if (iov_iter_is_xarray(i)) { - if (i->count & len_mask) - return false; - if ((i->xarray_start + i->iov_offset) & addr_mask) - return false; - } - if (iov_iter_is_folioq(i)) { - if (i->count & len_mask) - return false; - if (i->iov_offset & addr_mask) - return false; - } - - return true; -} -EXPORT_SYMBOL_GPL(iov_iter_is_aligned); - static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) { const struct iovec *iov = iter_iov(i); -- cgit v1.2.3 From d57447ffb5fadffdba920f2fb933296fb6c5ff57 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 3 Sep 2025 12:33:17 -0700 Subject: blk-mq-dma: bring back p2p request flags We only need to consider data and metadata dma mapping types separately. The request and bio integrity payload have enough flag bits to internally track the mapping type for each. Use these so the caller doesn't need to track them, and provide separete request and integrity helpers to the common code. This will make it easier to scale new mappings, like the proposed MMIO attribute, without burdening the caller to track such things. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Reviewed-by: Leon Romanovsky Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 4 ++++ drivers/nvme/host/pci.c | 21 ++++----------------- include/linux/bio-integrity.h | 1 + include/linux/blk-integrity.h | 15 +++++++++++++++ include/linux/blk-mq-dma.h | 11 +++++++++-- include/linux/blk_types.h | 2 ++ 6 files changed, 35 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index 660b5e200ccf..449950029872 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -174,6 +174,10 @@ static bool blk_dma_map_iter_start(struct request *req, struct device *dma_dev, switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, phys_to_page(vec.paddr))) { case PCI_P2PDMA_MAP_BUS_ADDR: + if (iter->iter.is_integrity) + bio_integrity(req->bio)->bip_flags |= BIP_P2P_DMA; + else + req->cmd_flags |= REQ_P2PDMA; return blk_dma_map_bus(iter, &vec); case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: /* diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d8a9dee55de3..28e203b894eb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -260,12 +260,6 @@ enum nvme_iod_flags { /* single segment dma mapping */ IOD_SINGLE_SEGMENT = 1U << 2, - /* DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */ - IOD_P2P_BUS_ADDR = 1U << 3, - - /* Metadata DMA mapped with PCI_P2PDMA_MAP_BUS_ADDR */ - IOD_META_P2P_BUS_ADDR = 1U << 4, - /* Metadata using non-coalesced MPTR */ IOD_SINGLE_META_SEGMENT = 1U << 5, }; @@ -737,9 +731,8 @@ static void nvme_unmap_metadata(struct request *req) return; } - if (!blk_rq_dma_unmap(req, dma_dev, &iod->meta_dma_state, - iod->meta_total_len, - iod->flags & IOD_META_P2P_BUS_ADDR)) { + if (!blk_rq_integrity_dma_unmap(req, dma_dev, &iod->meta_dma_state, + iod->meta_total_len)) { if (nvme_pci_cmd_use_meta_sgl(&iod->cmd)) nvme_free_sgls(req, sge, &sge[1]); else @@ -766,8 +759,7 @@ static void nvme_unmap_data(struct request *req) return; } - if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len, - iod->flags & IOD_P2P_BUS_ADDR)) { + if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) { if (nvme_pci_cmd_use_sgl(&iod->cmd)) nvme_free_sgls(req, iod->descriptors[0], &iod->cmd.common.dptr.sgl); @@ -1043,9 +1035,6 @@ static blk_status_t nvme_map_data(struct request *req) if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter)) return iter.status; - if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) - iod->flags |= IOD_P2P_BUS_ADDR; - if (use_sgl == SGL_FORCED || (use_sgl == SGL_SUPPORTED && (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold))) @@ -1068,9 +1057,7 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct request *req) &iod->meta_dma_state, &iter)) return iter.status; - if (iter.p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) - iod->flags |= IOD_META_P2P_BUS_ADDR; - else if (blk_rq_dma_map_coalesce(&iod->meta_dma_state)) + if (blk_rq_dma_map_coalesce(&iod->meta_dma_state)) entries = 1; /* diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 0a25716820fe..851254f36eb3 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -13,6 +13,7 @@ enum bip_flags { BIP_CHECK_GUARD = 1 << 5, /* guard check */ BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ + BIP_P2P_DMA = 1 << 8, /* using P2P address */ }; struct bio_integrity_payload { diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 78fe2459e661..b659373788f6 100644 --- 
a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -27,6 +27,15 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, #ifdef CONFIG_BLK_DEV_INTEGRITY int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); + +static inline bool blk_rq_integrity_dma_unmap(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + size_t mapped_len) +{ + return blk_dma_unmap(req, dma_dev, state, mapped_len, + bio_integrity(req->bio)->bip_flags & BIP_P2P_DMA); +} + int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes); @@ -115,6 +124,12 @@ static inline int blk_rq_map_integrity_sg(struct request *q, { return 0; } +static inline bool blk_rq_integrity_dma_unmap(struct request *req, + struct device *dma_dev, struct dma_iova_state *state, + size_t mapped_len) +{ + return false; +} static inline int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index 0f45ea110ca1..51829958d872 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -43,7 +43,7 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) } /** - * blk_rq_dma_unmap - try to DMA unmap a request + * blk_dma_unmap - try to DMA unmap a request * @req: request to unmap * @dma_dev: device to unmap from * @state: DMA IOVA state @@ -53,7 +53,7 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) * Returns %false if the callers need to manually unmap every DMA segment * mapped using @iter or %true if no work is left to be done. */ -static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, +static inline bool blk_dma_unmap(struct request *req, struct device *dma_dev, struct dma_iova_state *state, size_t mapped_len, bool is_p2p) { if (is_p2p) @@ -68,4 +68,11 @@ static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, return !dma_need_unmap(dma_dev); } +static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, size_t mapped_len) +{ + return blk_dma_unmap(req, dma_dev, state, mapped_len, + req->cmd_flags & REQ_P2PDMA); +} + #endif /* BLK_MQ_DMA_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index bbb7893e0542..4bd098fd61cb 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -384,6 +384,7 @@ enum req_flag_bits { __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ __REQ_ATOMIC, /* for atomic write operations */ + __REQ_P2PDMA, /* contains P2P DMA pages */ /* * Command specific flags, keep last: */ @@ -416,6 +417,7 @@ enum req_flag_bits { #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) +#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) -- cgit v1.2.3 From bf2da4799fdb6eb58d9c9541b7dc1096c260499d Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sat, 6 Sep 2025 18:29:44 -0700 Subject: net/mlx5: Implement cqe_compress_type via devlink params Selects which algorithm should be used by the NIC in order to decide the rate of CQE compression depending on PCIe bus conditions.
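For illustration, such a permanent string parameter would be set via the devlink CLI roughly as follows (the PCI device name is hypothetical, and as a permanent parameter stored in NV configuration the new value typically only takes effect after a firmware reset or reboot): $ devlink dev param set pci/0000:03:00.0 name cqe_compress_type value aggressive cmode permanent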
Supported values: 1) balanced, merges fewer CQEs, resulting in a moderate compression ratio but maintaining a balance between bandwidth savings and performance 2) aggressive, merges more CQEs into a single entry, achieving a higher compression rate and maximizing performance, particularly under high traffic loads. Signed-off-by: Saeed Mahameed Reviewed-by: Jiri Pirko Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250907012953.301746-3-saeed@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/mlx5.rst | 10 + drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 8 + drivers/net/ethernet/mellanox/mlx5/core/devlink.h | 1 + .../net/ethernet/mellanox/mlx5/core/lib/nv_param.c | 245 +++++++++++++++++++++ .../net/ethernet/mellanox/mlx5/core/lib/nv_param.h | 14 ++ include/linux/mlx5/driver.h | 1 + 7 files changed, 280 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.h (limited to 'include/linux') diff --git a/Documentation/networking/devlink/mlx5.rst b/Documentation/networking/devlink/mlx5.rst index 7febe0aecd53..2edc842b620d 100644 --- a/Documentation/networking/devlink/mlx5.rst +++ b/Documentation/networking/devlink/mlx5.rst @@ -117,6 +117,16 @@ parameters. - driverinit - Control the size (in packets) of the hairpin queues. + * - ``cqe_compress_type`` + - string + - permanent + - Configure which mechanism/algorithm should be used by the NIC that will + affect the rate (aggressiveness) of compressed CQEs depending on PCIe bus + conditions and other internal NIC factors. This mode affects all queues + that enable compression. + * ``balanced`` : Merges fewer CQEs, resulting in a moderate compression ratio but maintaining a balance between bandwidth savings and performance + * ``aggressive`` : Merges more CQEs into a single entry, achieving a higher compression rate and maximizing performance, particularly under high traffic loads + The ``mlx5`` driver supports reloading via ``DEVLINK_CMD_RELOAD`` Info versions diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index a65ab661375a..d77696f46eb5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -17,7 +17,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ fs_counters.o fs_ft_pool.o rl.o lag/debugfs.o lag/lag.o dev.o events.o wq.o lib/gid.o \ lib/devcom.o lib/pci_vsc.o lib/dm.o lib/fs_ttc.o diag/fs_tracepoint.o \ diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o diag/reporter_vnic.o \ - fw_reset.o qos.o lib/tout.o lib/aso.o wc.o fs_pool.o + fw_reset.o qos.o lib/tout.o lib/aso.o wc.o fs_pool.o lib/nv_param.o # # Netdev basic diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 2c0e0c16ca90..326d438e75b5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -10,6 +10,7 @@ #include "esw/qos.h" #include "sf/dev/dev.h" #include "sf/sf.h" +#include "lib/nv_param.h" static int mlx5_devlink_flash_update(struct devlink *devlink, struct devlink_flash_update_params *params, @@ -895,8 +896,14 @@ int mlx5_devlink_params_register(struct devlink *devlink) if (err) goto max_uc_list_err; + err = mlx5_nv_param_register_dl_params(devlink); + if (err) + goto 
nv_param_err; + return 0; +nv_param_err: + mlx5_devlink_max_uc_list_params_unregister(devlink); max_uc_list_err: mlx5_devlink_auxdev_params_unregister(devlink); auxdev_reg_err: @@ -907,6 +914,7 @@ auxdev_reg_err: void mlx5_devlink_params_unregister(struct devlink *devlink) { + mlx5_nv_param_unregister_dl_params(devlink); mlx5_devlink_max_uc_list_params_unregister(devlink); mlx5_devlink_auxdev_params_unregister(devlink); devl_params_unregister(devlink, mlx5_devlink_params, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h index 961f75da6227..74bcdfa70361 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h @@ -22,6 +22,7 @@ enum mlx5_devlink_param_id { MLX5_DEVLINK_PARAM_ID_ESW_MULTIPORT, MLX5_DEVLINK_PARAM_ID_HAIRPIN_NUM_QUEUES, MLX5_DEVLINK_PARAM_ID_HAIRPIN_QUEUE_SIZE, + MLX5_DEVLINK_PARAM_ID_CQE_COMPRESSION_TYPE }; struct mlx5_trap_ctx { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.c new file mode 100644 index 000000000000..20a39483be04 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include "nv_param.h" +#include "mlx5_core.h" + +enum { + MLX5_CLASS_0_CTRL_ID_NV_SW_OFFLOAD_CONFIG = 0x10a, +}; + +struct mlx5_ifc_configuration_item_type_class_global_bits { + u8 type_class[0x8]; + u8 parameter_index[0x18]; +}; + +union mlx5_ifc_config_item_type_auto_bits { + struct mlx5_ifc_configuration_item_type_class_global_bits + configuration_item_type_class_global; + u8 reserved_at_0[0x20]; +}; + +struct mlx5_ifc_config_item_bits { + u8 valid[0x2]; + u8 priority[0x2]; + u8 header_type[0x2]; + u8 ovr_en[0x1]; + u8 rd_en[0x1]; + u8 access_mode[0x2]; + u8 reserved_at_a[0x1]; + u8 writer_id[0x5]; + u8 version[0x4]; + u8 reserved_at_14[0x2]; + u8 host_id_valid[0x1]; + u8 length[0x9]; + + union mlx5_ifc_config_item_type_auto_bits type; + + u8 reserved_at_40[0x10]; + u8 crc16[0x10]; +}; + +struct mlx5_ifc_mnvda_reg_bits { + struct mlx5_ifc_config_item_bits configuration_item_header; + + u8 configuration_item_data[64][0x20]; +}; + +struct mlx5_ifc_nv_sw_offload_conf_bits { + u8 ip_over_vxlan_port[0x10]; + u8 tunnel_ecn_copy_offload_disable[0x1]; + u8 pci_atomic_mode[0x3]; + u8 sr_enable[0x1]; + u8 ptp_cyc2realtime[0x1]; + u8 vector_calc_disable[0x1]; + u8 uctx_en[0x1]; + u8 prio_tag_required_en[0x1]; + u8 esw_fdb_ipv4_ttl_modify_enable[0x1]; + u8 mkey_by_name[0x1]; + u8 ip_over_vxlan_en[0x1]; + u8 one_qp_per_recovery[0x1]; + u8 cqe_compression[0x3]; + u8 tunnel_udp_entropy_proto_disable[0x1]; + u8 reserved_at_21[0x1]; + u8 ar_enable[0x1]; + u8 log_max_outstanding_wqe[0x5]; + u8 vf_migration[0x2]; + u8 log_tx_psn_win[0x6]; + u8 lro_log_timeout3[0x4]; + u8 lro_log_timeout2[0x4]; + u8 lro_log_timeout1[0x4]; + u8 lro_log_timeout0[0x4]; +}; + +#define MNVDA_HDR_SZ \ + (MLX5_ST_SZ_BYTES(mnvda_reg) - \ + MLX5_BYTE_OFF(mnvda_reg, configuration_item_data)) + +#define MLX5_SET_CFG_ITEM_TYPE(_cls_name, _mnvda_ptr, _field, _val) \ + MLX5_SET(mnvda_reg, _mnvda_ptr, \ + configuration_item_header.type.configuration_item_type_class_##_cls_name._field, \ + _val) + +#define MLX5_SET_CFG_HDR_LEN(_mnvda_ptr, _cls_name) \ + MLX5_SET(mnvda_reg, _mnvda_ptr, configuration_item_header.length, \ + MLX5_ST_SZ_BYTES(_cls_name)) + +#define 
MLX5_GET_CFG_HDR_LEN(_mnvda_ptr) \ + MLX5_GET(mnvda_reg, _mnvda_ptr, configuration_item_header.length) + +static int mlx5_nv_param_read(struct mlx5_core_dev *dev, void *mnvda, + size_t len) +{ + u32 param_idx, type_class; + u32 header_len; + void *cls_ptr; + int err; + + if (WARN_ON(len > MLX5_ST_SZ_BYTES(mnvda_reg)) || len < MNVDA_HDR_SZ) + return -EINVAL; /* A caller bug */ + + err = mlx5_core_access_reg(dev, mnvda, len, mnvda, len, MLX5_REG_MNVDA, + 0, 0); + if (!err) + return 0; + + cls_ptr = MLX5_ADDR_OF(mnvda_reg, mnvda, + configuration_item_header.type.configuration_item_type_class_global); + + type_class = MLX5_GET(configuration_item_type_class_global, cls_ptr, + type_class); + param_idx = MLX5_GET(configuration_item_type_class_global, cls_ptr, + parameter_index); + header_len = MLX5_GET_CFG_HDR_LEN(mnvda); + + mlx5_core_warn(dev, "Failed to read mnvda reg: type_class 0x%x, param_idx 0x%x, header_len %u, err %d\n", + type_class, param_idx, header_len, err); + + return -EOPNOTSUPP; +} + +static int mlx5_nv_param_write(struct mlx5_core_dev *dev, void *mnvda, + size_t len) +{ + if (WARN_ON(len > MLX5_ST_SZ_BYTES(mnvda_reg)) || len < MNVDA_HDR_SZ) + return -EINVAL; + + if (WARN_ON(MLX5_GET_CFG_HDR_LEN(mnvda) == 0)) + return -EINVAL; + + return mlx5_core_access_reg(dev, mnvda, len, mnvda, len, MLX5_REG_MNVDA, + 0, 1); +} + +static int +mlx5_nv_param_read_sw_offload_conf(struct mlx5_core_dev *dev, void *mnvda, + size_t len) +{ + MLX5_SET_CFG_ITEM_TYPE(global, mnvda, type_class, 0); + MLX5_SET_CFG_ITEM_TYPE(global, mnvda, parameter_index, + MLX5_CLASS_0_CTRL_ID_NV_SW_OFFLOAD_CONFIG); + MLX5_SET_CFG_HDR_LEN(mnvda, nv_sw_offload_conf); + + return mlx5_nv_param_read(dev, mnvda, len); +} + +static const char *const + cqe_compress_str[] = { "balanced", "aggressive" }; + +static int +mlx5_nv_param_devlink_cqe_compress_get(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + u32 mnvda[MLX5_ST_SZ_DW(mnvda_reg)] = {}; + u8 value = U8_MAX; + void *data; + int err; + + err = mlx5_nv_param_read_sw_offload_conf(dev, mnvda, sizeof(mnvda)); + if (err) + return err; + + data = MLX5_ADDR_OF(mnvda_reg, mnvda, configuration_item_data); + value = MLX5_GET(nv_sw_offload_conf, data, cqe_compression); + + if (value >= ARRAY_SIZE(cqe_compress_str)) + return -EOPNOTSUPP; + + strscpy(ctx->val.vstr, cqe_compress_str[value], sizeof(ctx->val.vstr)); + return 0; +} + +static int +mlx5_nv_param_devlink_cqe_compress_validate(struct devlink *devlink, u32 id, + union devlink_param_value val, + struct netlink_ext_ack *extack) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(cqe_compress_str); i++) { + if (!strcmp(val.vstr, cqe_compress_str[i])) + return 0; + } + + NL_SET_ERR_MSG_MOD(extack, + "Invalid value, supported values are balanced/aggressive"); + return -EOPNOTSUPP; +} + +static int +mlx5_nv_param_devlink_cqe_compress_set(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack) +{ + struct mlx5_core_dev *dev = devlink_priv(devlink); + u32 mnvda[MLX5_ST_SZ_DW(mnvda_reg)] = {}; + int err = 0; + void *data; + u8 value; + + if (!strcmp(ctx->val.vstr, "aggressive")) + value = 1; + else /* balanced: can't be anything else already validated above */ + value = 0; + + err = mlx5_nv_param_read_sw_offload_conf(dev, mnvda, sizeof(mnvda)); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to read sw_offload_conf mnvda reg"); + return err; + } + + data = MLX5_ADDR_OF(mnvda_reg, mnvda, configuration_item_data); + 
MLX5_SET(nv_sw_offload_conf, data, cqe_compression, value); + + return mlx5_nv_param_write(dev, mnvda, sizeof(mnvda)); +} + +static const struct devlink_param mlx5_nv_param_devlink_params[] = { + DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_CQE_COMPRESSION_TYPE, + "cqe_compress_type", DEVLINK_PARAM_TYPE_STRING, + BIT(DEVLINK_PARAM_CMODE_PERMANENT), + mlx5_nv_param_devlink_cqe_compress_get, + mlx5_nv_param_devlink_cqe_compress_set, + mlx5_nv_param_devlink_cqe_compress_validate), +}; + +int mlx5_nv_param_register_dl_params(struct devlink *devlink) +{ + if (!mlx5_core_is_pf(devlink_priv(devlink))) + return 0; + + return devl_params_register(devlink, mlx5_nv_param_devlink_params, + ARRAY_SIZE(mlx5_nv_param_devlink_params)); +} + +void mlx5_nv_param_unregister_dl_params(struct devlink *devlink) +{ + if (!mlx5_core_is_pf(devlink_priv(devlink))) + return; + + devl_params_unregister(devlink, mlx5_nv_param_devlink_params, + ARRAY_SIZE(mlx5_nv_param_devlink_params)); +} + diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.h new file mode 100644 index 000000000000..9f4922ff7745 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/nv_param.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#ifndef __MLX5_NV_PARAM_H +#define __MLX5_NV_PARAM_H + +#include +#include "devlink.h" + +int mlx5_nv_param_register_dl_params(struct devlink *devlink); +void mlx5_nv_param_unregister_dl_params(struct devlink *devlink); + +#endif + diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index c0858af0e854..fcfc18bfeba9 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -137,6 +137,7 @@ enum { MLX5_REG_MTCAP = 0x9009, MLX5_REG_MTMP = 0x900A, MLX5_REG_MCIA = 0x9014, + MLX5_REG_MNVDA = 0x9024, MLX5_REG_MFRL = 0x9028, MLX5_REG_MLCR = 0x902b, MLX5_REG_MRTC = 0x902d, -- cgit v1.2.3 From e096a7cc0be126d9376e549a10d71cf16b1a1c1c Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Fri, 5 Sep 2025 11:07:09 +0800 Subject: ptp: add debugfs interfaces to loop back the periodic output signal Some PTP devices, such as the ptp_qoriq device, have the capability to loop back the periodic output signal for debugging. So add generic interfaces to set the periodic output signal loopback, rather than each vendor having a different implementation.
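As a rough sketch of the driver side (driver names are hypothetical; the ptp_clock_info fields are the ones added by this patch), a driver opts in by advertising how many channels support loopback and providing the callback:

static int foo_perout_loopback(struct ptp_clock_info *ptp,
			       unsigned int index, int on)
{
	struct foo_clock *clk = container_of(ptp, struct foo_clock, caps);

	/* program hardware loopback for periodic output channel 'index' */
	return foo_hw_set_perout_lpbk(clk, index, on);
}

	/* in the probe path, before ptp_clock_register() */
	clk->caps.n_per_lp = 2;
	clk->caps.perout_loopback = foo_perout_loopback;

With that in place, the core exposes the debugfs files described below.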
Show how many channels support the periodic output signal loopback: $ cat /sys/kernel/debug/ptpN/n_perout_loopback (N is the PTP clock index) Enable the loopback of the periodic output signal of channel X: $ echo "X 1" > /sys/kernel/debug/ptpN/perout_loopback Disable the loopback of the periodic output signal of channel X: $ echo "X 0" > /sys/kernel/debug/ptpN/perout_loopback Suggested-by: Andrew Lunn Signed-off-by: Wei Fang Link: https://patch.msgid.link/20250905030711.1509648-2-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_clock.c | 69 ++++++++++++++++++++++++++++++++++++++++ include/linux/ptp_clock_kernel.h | 10 ++++++ 2 files changed, 79 insertions(+) (limited to 'include/linux') diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 5739a57958c7..1d920f8e20a8 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -248,6 +248,69 @@ static void ptp_aux_kworker(struct kthread_work *work) kthread_queue_delayed_work(ptp->kworker, &ptp->aux_work, delay); } +static ssize_t ptp_n_perout_loopback_read(struct file *filep, + char __user *buffer, + size_t count, loff_t *pos) +{ + struct ptp_clock *ptp = filep->private_data; + char buf[12] = {}; + + snprintf(buf, sizeof(buf), "%d\n", ptp->info->n_per_lp); + + return simple_read_from_buffer(buffer, count, pos, buf, strlen(buf)); +} + +static const struct file_operations ptp_n_perout_loopback_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = ptp_n_perout_loopback_read, +}; + +static ssize_t ptp_perout_loopback_write(struct file *filep, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct ptp_clock *ptp = filep->private_data; + struct ptp_clock_info *ops = ptp->info; + unsigned int index, enable; + int len, cnt, err; + char buf[32] = {}; + + if (*ppos || !count) + return -EINVAL; + + if (count >= sizeof(buf)) + return -ENOSPC; + + len = simple_write_to_buffer(buf, sizeof(buf) - 1, + ppos, buffer, count); + if (len < 0) + return len; + + buf[len] = '\0'; + cnt = sscanf(buf, "%u %u", &index, &enable); + if (cnt != 2) + return -EINVAL; + + if (index >= ops->n_per_lp) + return -EINVAL; + + if (enable != 0 && enable != 1) + return -EINVAL; + + err = ops->perout_loopback(ops, index, enable); + if (err) + return err; + + return count; +} + +static const struct file_operations ptp_perout_loopback_ops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = ptp_perout_loopback_write, +}; + /* public interface */ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, @@ -389,6 +452,12 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, /* Debugfs initialization */ snprintf(debugfsname, sizeof(debugfsname), "ptp%d", ptp->index); ptp->debugfs_root = debugfs_create_dir(debugfsname, NULL); + if (info->n_per_lp > 0 && info->perout_loopback) { + debugfs_create_file("n_perout_loopback", 0400, ptp->debugfs_root, + ptp, &ptp_n_perout_loopback_fops); + debugfs_create_file("perout_loopback", 0200, ptp->debugfs_root, + ptp, &ptp_perout_loopback_ops); + } return ptp; diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 7dd7951b23d5..884364596dd3 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -67,6 +67,8 @@ struct ptp_system_timestamp { * @n_ext_ts: The number of external time stamp channels. * @n_per_out: The number of programmable periodic signals. * @n_pins: The number of programmable pins. + * @n_per_lp: The number of channels that support looping back the periodic + * output signal.
* @pps: Indicates whether the clock supports a PPS callback. * * @supported_perout_flags: The set of flags the driver supports for the @@ -175,6 +177,11 @@ struct ptp_system_timestamp { * scheduling time (>=0) or negative value in case further * scheduling is not required. * + * @perout_loopback: Request driver to enable or disable the periodic output + * signal loopback. + * parameter index: index of the periodic output signal channel. + * parameter on: caller passes one to enable or zero to disable. + * * Drivers should embed their ptp_clock_info within a private * structure, obtaining a reference to it using container_of(). * @@ -189,6 +196,7 @@ struct ptp_clock_info { int n_ext_ts; int n_per_out; int n_pins; + int n_per_lp; int pps; unsigned int supported_perout_flags; unsigned int supported_extts_flags; @@ -213,6 +221,8 @@ struct ptp_clock_info { int (*verify)(struct ptp_clock_info *ptp, unsigned int pin, enum ptp_pin_function func, unsigned int chan); long (*do_aux_work)(struct ptp_clock_info *ptp); + int (*perout_loopback)(struct ptp_clock_info *ptp, unsigned int index, + int on); }; struct ptp_clock; -- cgit v1.2.3 From f3164840a136b123c8348ca5af4d83d99aa86eb7 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Fri, 5 Sep 2025 11:07:11 +0800 Subject: ptp: qoriq: convert to use generic interfaces to set loopback mode Since the generic debugfs interfaces for setting the periodic pulse signal loopback have been added to the ptp_clock driver, convert the vendor-defined debugfs interfaces to the generic interfaces. Signed-off-by: Wei Fang Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Link: https://patch.msgid.link/20250905030711.1509648-4-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - drivers/ptp/Kconfig | 2 +- drivers/ptp/Makefile | 4 +- drivers/ptp/ptp_qoriq.c | 24 +++++++++- drivers/ptp/ptp_qoriq_debugfs.c | 101 ---------------------------------------- include/linux/fsl/ptp_qoriq.h | 10 ---- 6 files changed, 24 insertions(+), 118 deletions(-) delete mode 100644 drivers/ptp/ptp_qoriq_debugfs.c (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index b81595e9ea95..e3907f0c1243 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9817,7 +9817,6 @@ F: drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp* F: drivers/net/ethernet/freescale/dpaa2/dprtc* F: drivers/net/ethernet/freescale/enetc/enetc_ptp.c F: drivers/ptp/ptp_qoriq.c -F: drivers/ptp/ptp_qoriq_debugfs.c F: include/linux/fsl/ptp_qoriq.h FREESCALE QUAD SPI DRIVER diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index 9256bf2e8ad4..5f8ea34d11d6 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -67,7 +67,7 @@ config PTP_1588_CLOCK_QORIQ packets using the SO_TIMESTAMPING API. To compile this driver as a module, choose M here: the module - will be called ptp-qoriq. + will be called ptp_qoriq. comment "Enable PHYLIB and NETWORK_PHY_TIMESTAMPING to see the additional clocks."
depends on PHYLIB=n || NETWORK_PHY_TIMESTAMPING=n diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile index 8985d723d29c..bdc47e284f14 100644 --- a/drivers/ptp/Makefile +++ b/drivers/ptp/Makefile @@ -12,9 +12,7 @@ obj-$(CONFIG_PTP_1588_CLOCK_INES) += ptp_ines.o obj-$(CONFIG_PTP_1588_CLOCK_PCH) += ptp_pch.o obj-$(CONFIG_PTP_1588_CLOCK_KVM) += ptp_kvm.o obj-$(CONFIG_PTP_1588_CLOCK_VMCLOCK) += ptp_vmclock.o -obj-$(CONFIG_PTP_1588_CLOCK_QORIQ) += ptp-qoriq.o -ptp-qoriq-y += ptp_qoriq.o -ptp-qoriq-$(CONFIG_DEBUG_FS) += ptp_qoriq_debugfs.o +obj-$(CONFIG_PTP_1588_CLOCK_QORIQ) += ptp_qoriq.o obj-$(CONFIG_PTP_1588_CLOCK_IDTCM) += ptp_clockmatrix.o obj-$(CONFIG_PTP_1588_CLOCK_FC3W) += ptp_fc3.o obj-$(CONFIG_PTP_1588_CLOCK_IDT82P33) += ptp_idt82p33.o diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c index 4d488c1f1941..8da995e36aeb 100644 --- a/drivers/ptp/ptp_qoriq.c +++ b/drivers/ptp/ptp_qoriq.c @@ -465,6 +465,25 @@ static int ptp_qoriq_auto_config(struct ptp_qoriq *ptp_qoriq, return 0; } +static int ptp_qoriq_perout_loopback(struct ptp_clock_info *ptp, + unsigned int index, int on) +{ + struct ptp_qoriq *ptp_qoriq = container_of(ptp, struct ptp_qoriq, caps); + struct ptp_qoriq_registers *regs = &ptp_qoriq->regs; + u32 loopback_bit = index ? PP2L : PP1L; + u32 tmr_ctrl; + + tmr_ctrl = ptp_qoriq->read(®s->ctrl_regs->tmr_ctrl); + if (on) + tmr_ctrl |= loopback_bit; + else + tmr_ctrl &= ~loopback_bit; + + ptp_qoriq->write(®s->ctrl_regs->tmr_ctrl, tmr_ctrl); + + return 0; +} + int ptp_qoriq_init(struct ptp_qoriq *ptp_qoriq, void __iomem *base, const struct ptp_clock_info *caps) { @@ -479,6 +498,8 @@ int ptp_qoriq_init(struct ptp_qoriq *ptp_qoriq, void __iomem *base, ptp_qoriq->base = base; ptp_qoriq->caps = *caps; + ptp_qoriq->caps.n_per_lp = 2; + ptp_qoriq->caps.perout_loopback = ptp_qoriq_perout_loopback; if (of_property_read_u32(node, "fsl,cksel", &ptp_qoriq->cksel)) ptp_qoriq->cksel = DEFAULT_CKSEL; @@ -568,7 +589,7 @@ int ptp_qoriq_init(struct ptp_qoriq *ptp_qoriq, void __iomem *base, return PTR_ERR(ptp_qoriq->clock); ptp_qoriq->phc_index = ptp_clock_index(ptp_qoriq->clock); - ptp_qoriq_create_debugfs(ptp_qoriq); + return 0; } EXPORT_SYMBOL_GPL(ptp_qoriq_init); @@ -580,7 +601,6 @@ void ptp_qoriq_free(struct ptp_qoriq *ptp_qoriq) ptp_qoriq->write(®s->ctrl_regs->tmr_temask, 0); ptp_qoriq->write(®s->ctrl_regs->tmr_ctrl, 0); - ptp_qoriq_remove_debugfs(ptp_qoriq); ptp_clock_unregister(ptp_qoriq->clock); iounmap(ptp_qoriq->base); free_irq(ptp_qoriq->irq, ptp_qoriq); diff --git a/drivers/ptp/ptp_qoriq_debugfs.c b/drivers/ptp/ptp_qoriq_debugfs.c deleted file mode 100644 index e8dddcedf288..000000000000 --- a/drivers/ptp/ptp_qoriq_debugfs.c +++ /dev/null @@ -1,101 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* Copyright 2019 NXP - */ -#include -#include -#include - -static int ptp_qoriq_fiper1_lpbk_get(void *data, u64 *val) -{ - struct ptp_qoriq *ptp_qoriq = data; - struct ptp_qoriq_registers *regs = &ptp_qoriq->regs; - u32 ctrl; - - ctrl = ptp_qoriq->read(®s->ctrl_regs->tmr_ctrl); - *val = ctrl & PP1L ? 
1 : 0; - - return 0; -} - -static int ptp_qoriq_fiper1_lpbk_set(void *data, u64 val) -{ - struct ptp_qoriq *ptp_qoriq = data; - struct ptp_qoriq_registers *regs = &ptp_qoriq->regs; - u32 ctrl; - - ctrl = ptp_qoriq->read(®s->ctrl_regs->tmr_ctrl); - if (val == 0) - ctrl &= ~PP1L; - else - ctrl |= PP1L; - - ptp_qoriq->write(®s->ctrl_regs->tmr_ctrl, ctrl); - return 0; -} - -DEFINE_DEBUGFS_ATTRIBUTE(ptp_qoriq_fiper1_fops, ptp_qoriq_fiper1_lpbk_get, - ptp_qoriq_fiper1_lpbk_set, "%llu\n"); - -static int ptp_qoriq_fiper2_lpbk_get(void *data, u64 *val) -{ - struct ptp_qoriq *ptp_qoriq = data; - struct ptp_qoriq_registers *regs = &ptp_qoriq->regs; - u32 ctrl; - - ctrl = ptp_qoriq->read(®s->ctrl_regs->tmr_ctrl); - *val = ctrl & PP2L ? 1 : 0; - - return 0; -} - -static int ptp_qoriq_fiper2_lpbk_set(void *data, u64 val) -{ - struct ptp_qoriq *ptp_qoriq = data; - struct ptp_qoriq_registers *regs = &ptp_qoriq->regs; - u32 ctrl; - - ctrl = ptp_qoriq->read(®s->ctrl_regs->tmr_ctrl); - if (val == 0) - ctrl &= ~PP2L; - else - ctrl |= PP2L; - - ptp_qoriq->write(®s->ctrl_regs->tmr_ctrl, ctrl); - return 0; -} - -DEFINE_DEBUGFS_ATTRIBUTE(ptp_qoriq_fiper2_fops, ptp_qoriq_fiper2_lpbk_get, - ptp_qoriq_fiper2_lpbk_set, "%llu\n"); - -void ptp_qoriq_create_debugfs(struct ptp_qoriq *ptp_qoriq) -{ - struct dentry *root; - - root = debugfs_create_dir(dev_name(ptp_qoriq->dev), NULL); - if (IS_ERR(root)) - return; - if (!root) - goto err_root; - - ptp_qoriq->debugfs_root = root; - - if (!debugfs_create_file_unsafe("fiper1-loopback", 0600, root, - ptp_qoriq, &ptp_qoriq_fiper1_fops)) - goto err_node; - if (!debugfs_create_file_unsafe("fiper2-loopback", 0600, root, - ptp_qoriq, &ptp_qoriq_fiper2_fops)) - goto err_node; - return; - -err_node: - debugfs_remove_recursive(root); - ptp_qoriq->debugfs_root = NULL; -err_root: - dev_err(ptp_qoriq->dev, "failed to initialize debugfs\n"); -} - -void ptp_qoriq_remove_debugfs(struct ptp_qoriq *ptp_qoriq) -{ - debugfs_remove_recursive(ptp_qoriq->debugfs_root); - ptp_qoriq->debugfs_root = NULL; -} diff --git a/include/linux/fsl/ptp_qoriq.h b/include/linux/fsl/ptp_qoriq.h index b301bf7199d3..3601e25779ba 100644 --- a/include/linux/fsl/ptp_qoriq.h +++ b/include/linux/fsl/ptp_qoriq.h @@ -145,7 +145,6 @@ struct ptp_qoriq { struct ptp_clock *clock; struct ptp_clock_info caps; struct resource *rsrc; - struct dentry *debugfs_root; struct device *dev; bool extts_fifo_support; bool fiper3_support; @@ -195,14 +194,5 @@ int ptp_qoriq_settime(struct ptp_clock_info *ptp, int ptp_qoriq_enable(struct ptp_clock_info *ptp, struct ptp_clock_request *rq, int on); int extts_clean_up(struct ptp_qoriq *ptp_qoriq, int index, bool update_event); -#ifdef CONFIG_DEBUG_FS -void ptp_qoriq_create_debugfs(struct ptp_qoriq *ptp_qoriq); -void ptp_qoriq_remove_debugfs(struct ptp_qoriq *ptp_qoriq); -#else -static inline void ptp_qoriq_create_debugfs(struct ptp_qoriq *ptp_qoriq) -{ } -static inline void ptp_qoriq_remove_debugfs(struct ptp_qoriq *ptp_qoriq) -{ } -#endif #endif -- cgit v1.2.3 From 1733e88874838ddebf7774440c285700865e6b08 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:41 +0800 Subject: block: cleanup bio_issue Now that bio->bi_issue is only used by blk-iolatency to get bio issue time, replace bio_issue with u64 time directly and remove bio_issue to make code cleaner. 
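The consumer-side effect, as a small before/after sketch (helpers and fields as removed/added by this patch):

	/* before: decode the 51-bit truncated timestamp packed into bio_issue */
	u64 start = bio_issue_time(&bio->bi_issue);

	/* after: a plain, full-resolution nanosecond timestamp, read directly */
	u64 start = bio->issue_time_ns;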
Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk-cgroup.h | 2 +- block/blk-iolatency.c | 14 +++----------- block/blk.h | 42 ------------------------------------------ include/linux/blk_types.h | 7 ++----- 5 files changed, 7 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 9603ca3ec770..3a1a848940dd 100644 --- a/block/bio.c +++ b/block/bio.c @@ -261,7 +261,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_private = NULL; #ifdef CONFIG_BLK_CGROUP bio->bi_blkg = NULL; - bio->bi_issue.value = 0; + bio->issue_time_ns = 0; if (bdev) bio_associate_blkg(bio); #ifdef CONFIG_BLK_CGROUP_IOCOST diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 83367086cb6a..8328427e3165 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -372,7 +372,7 @@ static inline void blkg_put(struct blkcg_gq *blkg) static inline void blkcg_bio_issue_init(struct bio *bio) { - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); + bio->issue_time_ns = blk_time_get_ns(); } static inline void blkcg_use_delay(struct blkcg_gq *blkg) diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 2f8fdecdd7a9..554b191a6892 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -485,19 +485,11 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio) mod_timer(&blkiolat->timer, jiffies + HZ); } -static void iolatency_record_time(struct iolatency_grp *iolat, - struct bio_issue *issue, u64 now, - bool issue_as_root) +static void iolatency_record_time(struct iolatency_grp *iolat, u64 start, + u64 now, bool issue_as_root) { - u64 start = bio_issue_time(issue); u64 req_time; - /* - * Have to do this so we are truncated to the correct time that our - * issue is truncated to. - */ - now = __bio_issue_time(now); - if (now <= start) return; @@ -625,7 +617,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) * submitted, so do not account for it. 
*/ if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) { - iolatency_record_time(iolat, &bio->bi_issue, now, + iolatency_record_time(iolat, bio->issue_time_ns, now, issue_as_root); window_start = atomic64_read(&iolat->window_start); if (now > window_start && diff --git a/block/blk.h b/block/blk.h index 7d420c247d81..41397688b941 100644 --- a/block/blk.h +++ b/block/blk.h @@ -681,48 +681,6 @@ static inline ktime_t blk_time_get(void) return ns_to_ktime(blk_time_get_ns()); } -/* - * From most significant bit: - * 1 bit: reserved for other usage, see below - * 12 bits: original size of bio - * 51 bits: issue time of bio - */ -#define BIO_ISSUE_RES_BITS 1 -#define BIO_ISSUE_SIZE_BITS 12 -#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) -#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) -#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) -#define BIO_ISSUE_SIZE_MASK \ - (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) -#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) - -/* Reserved bit for blk-throtl */ -#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) - -static inline u64 __bio_issue_time(u64 time) -{ - return time & BIO_ISSUE_TIME_MASK; -} - -static inline u64 bio_issue_time(struct bio_issue *issue) -{ - return __bio_issue_time(issue->value); -} - -static inline sector_t bio_issue_size(struct bio_issue *issue) -{ - return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); -} - -static inline void bio_issue_init(struct bio_issue *issue, - sector_t size) -{ - size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; - issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | - (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) | - ((u64)size << BIO_ISSUE_SIZE_SHIFT)); -} - void bdev_release(struct file *bdev_file); int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 4bd098fd61cb..8e8d1cc8b06c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -198,10 +198,6 @@ static inline bool blk_path_error(blk_status_t error) return true; } -struct bio_issue { - u64 value; -}; - typedef __u32 __bitwise blk_opf_t; typedef unsigned int blk_qc_t; @@ -242,7 +238,8 @@ struct bio { * on release of the bio. */ struct blkcg_gq *bi_blkg; - struct bio_issue bi_issue; + /* Time that this bio was issued. */ + u64 issue_time_ns; #ifdef CONFIG_BLK_CGROUP_IOCOST u64 bi_iocost_cost; #endif -- cgit v1.2.3 From ea3d1f104db60f9d5074b33819ccea3c216e0bee Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:43 +0800 Subject: blk-mq: add QUEUE_FLAG_BIO_ISSUE_TIME bio->issue_time_ns is initialized for every bio; however, it's only used by blk-iolatency. Add a new queue_flag and only set this flag when blk-iolatency is enabled, so that the extra blk_time_get_ns() can be saved for disks where blk-iolatency is not enabled.
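The resulting pattern, sketched from the hunks below: the hot path pays only a test_bit() instead of an unconditional blk_time_get_ns(), and a policy that needs issue times toggles the flag while the queue is frozen so in-flight submissions see a consistent setting:

	memflags = blk_mq_freeze_queue(q);
	if (enable)
		blk_queue_flag_set(QUEUE_FLAG_BIO_ISSUE_TIME, q);
	else
		blk_queue_flag_clear(QUEUE_FLAG_BIO_ISSUE_TIME, q);
	blk_mq_unfreeze_queue(q, memflags);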
Signed-off-by: Yu Kuai Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 5 +++++ block/blk-mq-debugfs.c | 1 + block/blk-mq.c | 8 +++++--- include/linux/blkdev.h | 1 + 4 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 554b191a6892..45bd18f68541 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -742,10 +742,15 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) */ enabled = atomic_read(&blkiolat->enable_cnt); if (enabled != blkiolat->enabled) { + struct request_queue *q = blkiolat->rqos.disk->queue; unsigned int memflags; memflags = blk_mq_freeze_queue(blkiolat->rqos.disk->queue); blkiolat->enabled = enabled; + if (enabled) + blk_queue_flag_set(QUEUE_FLAG_BIO_ISSUE_TIME, q); + else + blk_queue_flag_clear(QUEUE_FLAG_BIO_ISSUE_TIME, q); blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue, memflags); } } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 32c65efdda46..4896525b1c05 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -96,6 +96,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(DISABLE_WBT_DEF), QUEUE_FLAG_NAME(NO_ELV_SWITCH), QUEUE_FLAG_NAME(QOS_ENABLED), + QUEUE_FLAG_NAME(BIO_ISSUE_TIME), }; #undef QUEUE_FLAG_NAME diff --git a/block/blk-mq.c b/block/blk-mq.c index 2c0c8ead7080..31cc743ffad7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -396,10 +396,12 @@ static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) #endif } -static inline void blk_mq_bio_issue_init(struct bio *bio) +static inline void blk_mq_bio_issue_init(struct request_queue *q, + struct bio *bio) { #ifdef CONFIG_BLK_CGROUP - bio->issue_time_ns = blk_time_get_ns(); + if (test_bit(QUEUE_FLAG_BIO_ISSUE_TIME, &q->queue_flags)) + bio->issue_time_ns = blk_time_get_ns(); #endif } @@ -3175,7 +3177,7 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) goto queue_exit; - blk_mq_bio_issue_init(bio); + blk_mq_bio_issue_init(q, bio); if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c8cb08b2ed29..7c542b1851fa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -657,6 +657,7 @@ enum { QUEUE_FLAG_DISABLE_WBT_DEF, /* for sched to disable/enable wbt */ QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */ QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */ + QUEUE_FLAG_BIO_ISSUE_TIME, /* record bio->issue_time_ns */ QUEUE_FLAG_MAX }; -- cgit v1.2.3 From e37b5596a19be9a150cb194ec32e78f295a3574b Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 10 Sep 2025 14:30:46 +0800 Subject: block: factor out a helper bio_submit_split_bioset() No functional changes are intended. Some drivers, like mdraid, split bios by internal processing; this prepares to unify the bio split code.
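A hedged sketch of the intended caller pattern (driver-side names hypothetical; semantics per the kerneldoc below):

	struct bio *split;

	split = bio_submit_split_bioset(bio, my_chunk_sectors, &my_bio_set);
	if (!split)
		return;	/* the original bio already failed and completed */

	/* work on 'split'; the remainder was chained and resubmitted */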
Signed-off-by: Yu Kuai Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 59 ++++++++++++++++++++++++++++++++++---------- include/linux/blkdev.h | 2 ++ 2 files changed, 42 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/block/blk-merge.c b/block/blk-merge.c index 354a3070e6a1..f680ccf81864 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -104,33 +104,54 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim) return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT; } +/* + * bio_submit_split_bioset - Submit a bio, splitting it at a designated sector + * @bio: the original bio to be submitted and split + * @split_sectors: the sector count at which to split + * @bs: the bio set used for allocating the new split bio + * + * The original bio is modified to contain the remaining sectors and submitted. + * The caller is responsible for submitting the returned bio. + * + * On success, the newly allocated bio representing the initial part will be + * returned; on failure, NULL will be returned and the original bio will fail. + */ +struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, + struct bio_set *bs) +{ + struct bio *split = bio_split(bio, split_sectors, GFP_NOIO, bs); + + if (IS_ERR(split)) { + bio->bi_status = errno_to_blk_status(PTR_ERR(split)); + bio_endio(bio); + return NULL; + } + + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); + WARN_ON_ONCE(bio_zone_write_plugging(bio)); + submit_bio_noacct(bio); + + return split; +} +EXPORT_SYMBOL_GPL(bio_submit_split_bioset); + static struct bio *bio_submit_split(struct bio *bio, int split_sectors) { - if (unlikely(split_sectors < 0)) - goto error; + if (unlikely(split_sectors < 0)) { + bio->bi_status = errno_to_blk_status(split_sectors); + bio_endio(bio); + return NULL; + } if (split_sectors) { - struct bio *split; - - split = bio_split(bio, split_sectors, GFP_NOIO, + bio = bio_submit_split_bioset(bio, split_sectors, &bio->bi_bdev->bd_disk->bio_split); - if (IS_ERR(split)) { - split_sectors = PTR_ERR(split); - goto error; - } - split->bi_opf |= REQ_NOMERGE; - bio_chain(split, bio); - trace_block_split(split, bio->bi_iter.bi_sector); - WARN_ON_ONCE(bio_zone_write_plugging(bio)); - submit_bio_noacct(bio); - return split; + if (bio) + bio->bi_opf |= REQ_NOMERGE; } return bio; -error: - bio->bi_status = errno_to_blk_status(split_sectors); - bio_endio(bio); - return NULL; } struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7c542b1851fa..066e5309bd45 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1000,6 +1000,8 @@ extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); void submit_bio_noacct(struct bio *bio); struct bio *bio_split_to_limits(struct bio *bio); +struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors, + struct bio_set *bs); extern int blk_lld_busy(struct request_queue *q); extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); -- cgit v1.2.3 From 448097bbd3836d2ee46fa6eabd18661e9a3c8be8 Mon Sep 17 00:00:00 2001 From: Jean-François Lessard Date: Tue, 2 Sep 2025 15:04:39 -0400 Subject: device property: Add scoped fwnode child node iterators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add scoped
versions of fwnode child node iterators that automatically handle reference counting cleanup using the __free() attribute:

- fwnode_for_each_child_node_scoped()
- fwnode_for_each_available_child_node_scoped()

These macros follow the same pattern as existing scoped iterators in the kernel, ensuring fwnode references are automatically released when the iterator variable goes out of scope. This prevents resource leaks and eliminates the need for manual cleanup in error paths. The implementation mirrors the non-scoped variants but uses __free(fwnode_handle) for automatic resource management, providing a safer and more convenient interface for drivers iterating over firmware node children. Signed-off-by: Jean-François Lessard Acked-by: Danilo Krummrich Reviewed-by: Andy Shevchenko Reviewed-by: Sakari Ailus Signed-off-by: Wolfram Sang --- include/linux/property.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/property.h b/include/linux/property.h index 82f0cb3abd1e..862e208133f3 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -176,6 +176,16 @@ struct fwnode_handle *fwnode_get_next_available_child_node( for (child = fwnode_get_next_available_child_node(fwnode, NULL); child;\ child = fwnode_get_next_available_child_node(fwnode, child)) +#define fwnode_for_each_child_node_scoped(fwnode, child) \ + for (struct fwnode_handle *child __free(fwnode_handle) = \ + fwnode_get_next_child_node(fwnode, NULL); \ + child; child = fwnode_get_next_child_node(fwnode, child)) + +#define fwnode_for_each_available_child_node_scoped(fwnode, child) \ + for (struct fwnode_handle *child __free(fwnode_handle) = \ + fwnode_get_next_available_child_node(fwnode, NULL); \ + child; child = fwnode_get_next_available_child_node(fwnode, child)) + struct fwnode_handle *device_get_next_child_node(const struct device *dev, struct fwnode_handle *child); -- cgit v1.2.3 From a1ffc8ad3165fa1cf6a60c6a4b4e00dfd6603cf2 Mon Sep 17 00:00:00 2001 From: Yi Tao Date: Wed, 10 Sep 2025 14:59:33 +0800 Subject: cgroup: refactor the cgroup_attach_lock code to make it clearer Dynamic cgroup migration involving threadgroup locks can be in one of two states: no lock held, or holding the global lock. Explicitly declaring the different lock modes makes the code easier to understand and facilitates future extensions of the lock modes. Signed-off-by: Yi Tao Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 8 +++++ kernel/cgroup/cgroup-internal.h | 9 +++--- kernel/cgroup/cgroup-v1.c | 14 ++++----- kernel/cgroup/cgroup.c | 67 ++++++++++++++++++++++++++++++----------- 4 files changed, 69 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 92ed6d18266d..ff3c7d0e3e01 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -140,6 +140,14 @@ enum { __CFTYPE_ADDED = (1 << 18), }; +enum cgroup_attach_lock_mode { + /* Default */ + CGRP_ATTACH_LOCK_GLOBAL, + + /* When pid=0 && threadgroup=false, see comments in cgroup_procs_write_start */ + CGRP_ATTACH_LOCK_NONE, +}; + /* * cgroup_file is the handle for a file instance created in a cgroup which * is used, for example, to generate file changed notifications.
This can diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index b14e61c64a34..a6d6f30b6f65 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -249,12 +249,13 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -void cgroup_attach_lock(bool lock_threadgroup); -void cgroup_attach_unlock(bool lock_threadgroup); +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode); +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *locked) + enum cgroup_attach_lock_mode *lock_mode) __acquires(&cgroup_threadgroup_rwsem); -void cgroup_procs_write_finish(struct task_struct *task, bool locked) +void cgroup_procs_write_finish(struct task_struct *task, + enum cgroup_attach_lock_mode lock_mode) __releases(&cgroup_threadgroup_rwsem); void cgroup_lock_and_drain_offline(struct cgroup *cgrp); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 0a23b65de013..852ebe7ca3a1 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -69,7 +69,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; cgroup_lock(); - cgroup_attach_lock(true); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL); for_each_root(root) { struct cgroup *from_cgrp; @@ -81,7 +81,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } - cgroup_attach_unlock(true); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL); cgroup_unlock(); return retval; @@ -118,7 +118,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) cgroup_lock(); - cgroup_attach_lock(true); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL); /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); @@ -154,7 +154,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(true); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL); cgroup_unlock(); return ret; } @@ -503,13 +503,13 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; - bool locked; + enum cgroup_attach_lock_mode lock_mode; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &locked); + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -532,7 +532,7 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, locked); + cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0607c5d09237..0994eeaa2f69 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2482,7 +2482,7 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); /** * cgroup_attach_lock - Lock for ->attach() - * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem + * @lock_mode: whether to down_write cgroup_threadgroup_rwsem * * cgroup migration sometimes needs to stabilize threadgroups against forks and * exits by write-locking cgroup_threadgroup_rwsem. 
However, some ->attach() @@ -2503,21 +2503,39 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. */ -void cgroup_attach_lock(bool lock_threadgroup) +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode) { cpus_read_lock(); - if (lock_threadgroup) + + switch (lock_mode) { + case CGRP_ATTACH_LOCK_NONE: + break; + case CGRP_ATTACH_LOCK_GLOBAL: percpu_down_write(&cgroup_threadgroup_rwsem); + break; + default: + pr_warn("cgroup: Unexpected attach lock mode."); + break; + } } /** * cgroup_attach_unlock - Undo cgroup_attach_lock() - * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem + * @lock_mode: whether to up_write cgroup_threadgroup_rwsem */ -void cgroup_attach_unlock(bool lock_threadgroup) +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode) { - if (lock_threadgroup) + switch (lock_mode) { + case CGRP_ATTACH_LOCK_NONE: + break; + case CGRP_ATTACH_LOCK_GLOBAL: percpu_up_write(&cgroup_threadgroup_rwsem); + break; + default: + pr_warn("cgroup: Unexpected attach lock mode."); + break; + } + cpus_read_unlock(); } @@ -2991,7 +3009,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, } struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, - bool *threadgroup_locked) + enum cgroup_attach_lock_mode *lock_mode) { struct task_struct *tsk; pid_t pid; @@ -3008,8 +3026,13 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, * Therefore, we can skip the global lock. */ lockdep_assert_held(&cgroup_mutex); - *threadgroup_locked = pid || threadgroup; - cgroup_attach_lock(*threadgroup_locked); + + if (pid || threadgroup) + *lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + else + *lock_mode = CGRP_ATTACH_LOCK_NONE; + + cgroup_attach_lock(*lock_mode); rcu_read_lock(); if (pid) { @@ -3040,19 +3063,20 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, goto out_unlock_rcu; out_unlock_threadgroup: - cgroup_attach_unlock(*threadgroup_locked); - *threadgroup_locked = false; + cgroup_attach_unlock(*lock_mode); + *lock_mode = CGRP_ATTACH_LOCK_NONE; out_unlock_rcu: rcu_read_unlock(); return tsk; } -void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) +void cgroup_procs_write_finish(struct task_struct *task, + enum cgroup_attach_lock_mode lock_mode) { /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - cgroup_attach_unlock(threadgroup_locked); + cgroup_attach_unlock(lock_mode); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) @@ -3104,6 +3128,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; + enum cgroup_attach_lock_mode lock_mode; bool has_tasks; int ret; @@ -3135,7 +3160,13 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) * write-locking can be skipped safely. 
*/ has_tasks = !list_empty(&mgctx.preloaded_src_csets); - cgroup_attach_lock(has_tasks); + + if (has_tasks) + lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + else + lock_mode = CGRP_ATTACH_LOCK_NONE; + + cgroup_attach_lock(lock_mode); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); @@ -3156,7 +3187,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(has_tasks); + cgroup_attach_unlock(lock_mode); return ret; } @@ -5279,13 +5310,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct task_struct *task; const struct cred *saved_cred; ssize_t ret; - bool threadgroup_locked; + enum cgroup_attach_lock_mode lock_mode; dst_cgrp = cgroup_kn_lock_live(of->kn, false); if (!dst_cgrp) return -ENODEV; - task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; @@ -5311,7 +5342,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, ret = cgroup_attach_task(dst_cgrp, task, threadgroup); out_finish: - cgroup_procs_write_finish(task, threadgroup_locked); + cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); -- cgit v1.2.3 From 0568f89d4fb82d98001baeb870e92f43cd1f7317 Mon Sep 17 00:00:00 2001 From: Yi Tao Date: Wed, 10 Sep 2025 14:59:35 +0800 Subject: cgroup: replace global percpu_rwsem with per threadgroup rwsem when writing to cgroup.procs The static usage pattern of creating a cgroup, enabling controllers, and then seeding it with CLONE_INTO_CGROUP doesn't require write locking cgroup_threadgroup_rwsem and thus doesn't benefit from this patch. To avoid affecting other users, the per threadgroup rwsem is only used when favordynmods is enabled. As computer hardware advances, modern systems are typically equipped with many CPU cores and large amounts of memory, enabling the deployment of numerous applications. On such systems, container creation and deletion become frequent operations, making cgroup process migration no longer a cold path. This leads to noticeable contention with common process operations such as fork, exec, and exit. To alleviate the contention between cgroup process migration and operations like process fork, this patch modifies the locking to take the write lock on signal_struct->group_rwsem when writing a pid to cgroup.procs/threads instead of holding a global write lock. Cgroup process migration has historically relied on signal_struct->group_rwsem to protect thread group integrity. In commit <1ed1328792ff> ("sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem"), this was changed to a global cgroup_threadgroup_rwsem. The advantage of using a global lock was simplified handling of process group migrations. This patch retains the use of the global lock for protecting process group migration, while reducing contention by using a per thread group lock during cgroup.procs/threads writes.
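The read-side pairing can be sketched as follows (a paraphrase of the cgroup_threadgroup_change_begin() hunk further down, with commentary added); the table that follows summarizes all three paths:

static inline void threadgroup_change_begin_sketch(struct task_struct *tsk)
{
        /* The global rwsem is always taken first ... */
        percpu_down_read(&cgroup_threadgroup_rwsem);
        /*
         * ... then, with favordynmods, the task's own group rwsem. A
         * writer holds exactly one of the two in write mode, so either
         * choice excludes every reader of the affected thread group.
         */
        if (cgroup_enable_per_threadgroup_rwsem)
                down_read(&tsk->signal->cgroup_threadgroup_rwsem);
}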
The locking behavior is as follows:

write cgroup.procs/threads | process fork,exec,exit | process group migration
------------------------------------------------------------------------------
cgroup_lock()              | down_read(&g_rwsem)    | cgroup_lock()
down_write(&p_rwsem)       | down_read(&p_rwsem)    | down_write(&g_rwsem)
critical section           | critical section       | critical section
up_write(&p_rwsem)         | up_read(&p_rwsem)      | up_write(&g_rwsem)
cgroup_unlock()            | up_read(&g_rwsem)      | cgroup_unlock()

g_rwsem denotes cgroup_threadgroup_rwsem, p_rwsem denotes signal_struct->group_rwsem.

This patch eliminates contention between cgroup migration and fork operations for threads that belong to different thread groups, thereby reducing the long-tail latency of cgroup migrations and lowering system load. With this patch, under heavy fork and exec interference, the long-tail latency of cgroup migration has been reduced from milliseconds to microseconds. Under heavy cgroup migration interference, the multi-CPU score of the spawn test case in UnixBench increased by 9%. tj: Update comment in cgroup_favor_dynmods() and switch WARN_ONCE() to pr_warn_once(). Signed-off-by: Yi Tao Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 17 +++++++++- include/linux/sched/signal.h | 4 +++ init/init_task.c | 3 ++ kernel/cgroup/cgroup-internal.h | 6 ++-- kernel/cgroup/cgroup-v1.c | 8 ++--- kernel/cgroup/cgroup.c | 73 ++++++++++++++++++++++++++++++++--------- kernel/fork.c | 4 +++ 7 files changed, 93 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ff3c7d0e3e01..93318fce31f3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -91,6 +91,12 @@ enum { * cgroup_threadgroup_rwsem. This makes hot path operations such as * forks and exits into the slow path and more expensive. * + * Alleviate the contention between fork, exec, exit operations and + * writing to cgroup.procs by taking a per threadgroup rwsem instead of + * the global cgroup_threadgroup_rwsem. Fork and other operations + * from threads in different thread groups no longer contend with + * writing to cgroup.procs. + * * The static usage pattern of creating a cgroup, enabling controllers, * and then seeding it with CLONE_INTO_CGROUP doesn't require write * locking cgroup_threadgroup_rwsem and thus doesn't benefit from @@ -146,6 +152,9 @@ enum cgroup_attach_lock_mode { /* When pid=0 && threadgroup=false, see comments in cgroup_procs_write_start */ CGRP_ATTACH_LOCK_NONE, + + /* When favordynmods is on, see comments above CGRP_ROOT_FAVOR_DYNMODS */ + CGRP_ATTACH_LOCK_PER_THREADGROUP, }; /* @@ -846,6 +855,7 @@ struct cgroup_subsys { }; extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; +extern bool cgroup_enable_per_threadgroup_rwsem; struct cgroup_of_peak { unsigned long value; @@ -857,11 +867,14 @@ struct cgroup_of_peak { * @tsk: target task * * Allows cgroup operations to synchronize against threadgroup changes - * using a percpu_rw_semaphore. + * using a global percpu_rw_semaphore and a per threadgroup rw_semaphore when + * favordynmods is on. See the comment above CGRP_ROOT_FAVOR_DYNMODS definition.
*/ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) { percpu_down_read(&cgroup_threadgroup_rwsem); + if (cgroup_enable_per_threadgroup_rwsem) + down_read(&tsk->signal->cgroup_threadgroup_rwsem); } /** @@ -872,6 +885,8 @@ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) */ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) { + if (cgroup_enable_per_threadgroup_rwsem) + up_read(&tsk->signal->cgroup_threadgroup_rwsem); percpu_up_read(&cgroup_threadgroup_rwsem); } diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 1ef1edbaaf79..7d6449982822 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -226,6 +226,10 @@ struct signal_struct { struct tty_audit_buf *tty_audit_buf; #endif +#ifdef CONFIG_CGROUPS + struct rw_semaphore cgroup_threadgroup_rwsem; +#endif + /* * Thread is the potential origin of an oom condition; kill first on * oom diff --git a/init/init_task.c b/init/init_task.c index e557f622bd90..a55e2189206f 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -27,6 +27,9 @@ static struct signal_struct init_signals = { }, .multiprocess = HLIST_HEAD_INIT, .rlim = INIT_RLIMITS, +#ifdef CONFIG_CGROUPS + .cgroup_threadgroup_rwsem = __RWSEM_INITIALIZER(init_signals.cgroup_threadgroup_rwsem), +#endif .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock), #ifdef CONFIG_POSIX_TIMERS diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index a6d6f30b6f65..22051b4f1ccb 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -249,8 +249,10 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); -void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode); -void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode); +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk); +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, enum cgroup_attach_lock_mode *lock_mode) __acquires(&cgroup_threadgroup_rwsem); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 852ebe7ca3a1..a9e029b570c8 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -69,7 +69,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; cgroup_lock(); - cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); for_each_root(root) { struct cgroup *from_cgrp; @@ -81,7 +81,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } - cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); cgroup_unlock(); return retval; @@ -118,7 +118,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) cgroup_lock(); - cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL); + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); @@ -154,7 +154,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); - 
cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL); + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); cgroup_unlock(); return ret; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a6b81b48bb70..fed701df1167 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -239,6 +239,14 @@ static u16 have_canfork_callback __read_mostly; static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); +/* + * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem, + * read protected by either. + * + * Can only be turned on, but not turned off. + */ +bool cgroup_enable_per_threadgroup_rwsem __read_mostly; + /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { .ns.count = REFCOUNT_INIT(2), @@ -1325,14 +1333,30 @@ void cgroup_favor_dynmods(struct cgroup_root *root, bool favor) { bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; - /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */ + /* + * See the comment above CGRP_ROOT_FAVOR_DYNMODS definition. + * favordynmods can flip while a task is between + * cgroup_threadgroup_change_begin() and end(), so down_write the global + * cgroup_threadgroup_rwsem to synchronize them. + * + * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding + * cgroup_threadgroup_rwsem doesn't exclude tasks between + * cgroup_threadgroup_change_begin() and end() and thus it's unsafe to + * turn off. As the scenario is unlikely, simply disallow disabling once + * enabled and print out a warning. + */ + percpu_down_write(&cgroup_threadgroup_rwsem); if (favor && !favoring) { + cgroup_enable_per_threadgroup_rwsem = true; rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); root->flags |= CGRP_ROOT_FAVOR_DYNMODS; } else if (!favor && favoring) { + if (cgroup_enable_per_threadgroup_rwsem) + pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n"); rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; } + percpu_up_write(&cgroup_threadgroup_rwsem); } static int cgroup_init_root_id(struct cgroup_root *root) @@ -2482,7 +2506,8 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); /** * cgroup_attach_lock - Lock for ->attach() - * @lock_mode: whether to down_write cgroup_threadgroup_rwsem + * @lock_mode: whether to acquire a rwsem and, if so, which one to acquire + * @tsk: thread group to lock * * cgroup migration sometimes needs to stabilize threadgroups against forks and * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() @@ -2502,8 +2527,15 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns); * Resolve the situation by always acquiring cpus_read_lock() before optionally * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. + * + * When favordynmods is enabled, take the per threadgroup rwsem to reduce + * overhead on dynamic cgroup modifications. See the comment above the + * CGRP_ROOT_FAVOR_DYNMODS definition. + * + * @tsk is not NULL only when writing to cgroup.procs.
*/ -void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode) +void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk) { cpus_read_lock(); @@ -2513,6 +2545,9 @@ void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode) case CGRP_ATTACH_LOCK_GLOBAL: percpu_down_write(&cgroup_threadgroup_rwsem); break; + case CGRP_ATTACH_LOCK_PER_THREADGROUP: + down_write(&tsk->signal->cgroup_threadgroup_rwsem); + break; default: pr_warn("cgroup: Unexpected attach lock mode."); break; @@ -2521,9 +2556,11 @@ void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode) /** * cgroup_attach_unlock - Undo cgroup_attach_lock() - * @lock_mode: whether to up_write cgroup_threadgroup_rwsem + * @lock_mode: whether to release a rwsem and, if so, which one to release + * @tsk: thread group to unlock */ -void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode) +void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, + struct task_struct *tsk) { switch (lock_mode) { case CGRP_ATTACH_LOCK_NONE: @@ -2531,6 +2568,9 @@ void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode) case CGRP_ATTACH_LOCK_GLOBAL: percpu_up_write(&cgroup_threadgroup_rwsem); break; + case CGRP_ATTACH_LOCK_PER_THREADGROUP: + up_write(&tsk->signal->cgroup_threadgroup_rwsem); + break; default: pr_warn("cgroup: Unexpected attach lock mode."); break; @@ -3042,7 +3082,6 @@ retry_find_task: tsk = ERR_PTR(-EINVAL); goto out_unlock_rcu; } - get_task_struct(tsk); rcu_read_unlock(); @@ -3055,12 +3094,16 @@ retry_find_task: */ lockdep_assert_held(&cgroup_mutex); - if (pid || threadgroup) - *lock_mode = CGRP_ATTACH_LOCK_GLOBAL; - else + if (pid || threadgroup) { + if (cgroup_enable_per_threadgroup_rwsem) + *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP; + else + *lock_mode = CGRP_ATTACH_LOCK_GLOBAL; + } else { *lock_mode = CGRP_ATTACH_LOCK_NONE; + } - cgroup_attach_lock(*lock_mode); + cgroup_attach_lock(*lock_mode, tsk); if (threadgroup) { if (!thread_group_leader(tsk)) { @@ -3069,7 +3112,7 @@ retry_find_task: * may strip us of our leadership. If this happens, * throw this task away and try again.
*/ - cgroup_attach_unlock(*lock_mode); + cgroup_attach_unlock(*lock_mode, tsk); put_task_struct(tsk); goto retry_find_task; } @@ -3085,10 +3128,10 @@ out_unlock_rcu: void cgroup_procs_write_finish(struct task_struct *task, enum cgroup_attach_lock_mode lock_mode) { + cgroup_attach_unlock(lock_mode, task); + /* release reference from cgroup_procs_write_start() */ put_task_struct(task); - - cgroup_attach_unlock(lock_mode); } static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) @@ -3178,7 +3221,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) else lock_mode = CGRP_ATTACH_LOCK_NONE; - cgroup_attach_lock(lock_mode); + cgroup_attach_lock(lock_mode, NULL); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); @@ -3199,7 +3242,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - cgroup_attach_unlock(lock_mode); + cgroup_attach_unlock(lock_mode, NULL); return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index c4ada32598bd..9a039867ecfd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1688,6 +1688,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); +#ifdef CONFIG_CGROUPS + init_rwsem(&sig->cgroup_threadgroup_rwsem); +#endif + sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; -- cgit v1.2.3 From 1dfdf4527fd391f653c53b634af9122613c58904 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 30 Aug 2025 18:48:32 +0200 Subject: iio: adc: exynos_adc: Drop platform data support There are no Samsung Exynos SoC ADC driver users that bind via the platform ID, so the platform data is never set and can be dropped.
Reviewed-by: Andy Shevchenko Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20250830-s3c-cleanup-adc-v2-3-4f8299343d32@linaro.org Signed-off-by: Jonathan Cameron --- drivers/iio/adc/exynos_adc.c | 11 +---------- include/linux/platform_data/touchscreen-s3c2410.h | 22 ---------------------- 2 files changed, 1 insertion(+), 32 deletions(-) delete mode 100644 include/linux/platform_data/touchscreen-s3c2410.h (limited to 'include/linux') diff --git a/drivers/iio/adc/exynos_adc.c b/drivers/iio/adc/exynos_adc.c index dd4b9921a8b5..1484adff00df 100644 --- a/drivers/iio/adc/exynos_adc.c +++ b/drivers/iio/adc/exynos_adc.c @@ -29,8 +29,6 @@ #include #include -#include - /* S3C/EXYNOS4412/5250 ADC_V1 registers definitions */ #define ADC_V1_CON(x) ((x) + 0x00) #define ADC_V1_DLY(x) ((x) + 0x08) @@ -106,7 +104,6 @@ struct exynos_adc { struct clk *clk; struct clk *sclk; unsigned int irq; - unsigned int delay; struct regulator *vdd; struct completion completion; @@ -213,7 +210,7 @@ static void exynos_adc_v1_init_hw(struct exynos_adc *info) writel(con1, ADC_V1_CON(info->regs)); /* set touchscreen delay */ - writel(info->delay, ADC_V1_DLY(info->regs)); + writel(10000, ADC_V1_DLY(info->regs)); } static void exynos_adc_v1_exit_hw(struct exynos_adc *info) @@ -556,7 +553,6 @@ static int exynos_adc_probe(struct platform_device *pdev) { struct exynos_adc *info = NULL; struct device_node *np = pdev->dev.of_node; - struct s3c2410_ts_mach_info *pdata = dev_get_platdata(&pdev->dev); struct iio_dev *indio_dev = NULL; int ret; int irq; @@ -655,11 +651,6 @@ static int exynos_adc_probe(struct platform_device *pdev) if (info->data->init_hw) info->data->init_hw(info); - if (pdata) - info->delay = pdata->delay; - else - info->delay = 10000; - ret = of_platform_populate(np, exynos_adc_match, NULL, &indio_dev->dev); if (ret < 0) { dev_err(&pdev->dev, "failed adding child nodes\n"); diff --git a/include/linux/platform_data/touchscreen-s3c2410.h b/include/linux/platform_data/touchscreen-s3c2410.h deleted file mode 100644 index bf8d3b9d7c6a..000000000000 --- a/include/linux/platform_data/touchscreen-s3c2410.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (c) 2005 Arnaud Patard -*/ - -#ifndef __TOUCHSCREEN_S3C2410_H -#define __TOUCHSCREEN_S3C2410_H - -struct s3c2410_ts_mach_info { - int delay; - int presc; - int oversampling_shift; - void (*cfg_gpio)(struct platform_device *dev); -}; - -extern void s3c24xx_ts_set_platdata(struct s3c2410_ts_mach_info *); -extern void s3c64xx_ts_set_platdata(struct s3c2410_ts_mach_info *); - -/* defined by architecture to configure gpio */ -extern void s3c24xx_ts_cfg_gpio(struct platform_device *dev); - -#endif /*__TOUCHSCREEN_S3C2410_H */ -- cgit v1.2.3 From cec1aec9c46305743a2d4a1bfb06f6b0374d5ed0 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 31 Aug 2025 12:48:22 +0200 Subject: iio: consumers: Add an iio_multiply_value() helper function The channel-scale handling in iio_convert_raw_to_processed() in essence does the following:

  processed = raw * caller-provided-scale * channel-scale

Which can also be written as:

  multiplier = raw * caller-provided-scale
  iio-value = channel-scale
  processed = multiplier * iio-value

Where iio-value is a set of IIO_VAL_* type + val + val2 integers. Being able to handle multiplication of iio-values like this is useful to have in general and, as previous bugfixes to iio_convert_raw_to_processed() have shown, also tricky to implement.
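As a concrete instance of the identity above (a standalone model with made-up numbers, not the kernel code): raw = 100 and a caller-provided scale of 30 give multiplier = 3000, and a channel scale of IIO_VAL_FRACTIONAL 1/4 then yields processed = 750:

#include <stdint.h>
#include <stdio.h>

/* Model of the IIO_VAL_FRACTIONAL case: the channel scale is val / val2. */
static int64_t multiply_fractional(int64_t multiplier, int val, int val2)
{
        return multiplier * val / val2;         /* div_s64() in the kernel */
}

int main(void)
{
        int64_t multiplier = 100 * 30;  /* raw * caller-provided-scale */

        /* channel scale 1/4 -> processed = 3000 * 1 / 4 = 750 */
        printf("processed = %lld\n",
               (long long)multiply_fractional(multiplier, 1, 4));
        return 0;
}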
Split the iio-value multiplication code from iio_convert_raw_to_processed() out into a new iio_multiply_value() helper. This serves multiple purposes: 1. Having this split out allows writing a KUnit test for this. 2. Having this split out allows re-use to get better precision when scaling values in iio_read_channel_processed_scale(). Reviewed-by: Andy Shevchenko Signed-off-by: Hans de Goede Link: https://patch.msgid.link/20250831104825.15097-4-hansg@kernel.org Signed-off-by: Jonathan Cameron --- drivers/iio/inkern.c | 72 ++++++++++++++++++++++++++------------------ include/linux/iio/consumer.h | 18 +++++++++++ 2 files changed, 60 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/inkern.c b/drivers/iio/inkern.c index 642beb4b3360..158d54de14a7 100644 --- a/drivers/iio/inkern.c +++ b/drivers/iio/inkern.c @@ -599,13 +599,50 @@ int iio_read_channel_average_raw(struct iio_channel *chan, int *val) } EXPORT_SYMBOL_GPL(iio_read_channel_average_raw); +int iio_multiply_value(int *result, s64 multiplier, + unsigned int type, int val, int val2) +{ + s64 denominator; + + switch (type) { + case IIO_VAL_INT: + *result = multiplier * val; + return IIO_VAL_INT; + case IIO_VAL_INT_PLUS_MICRO: + case IIO_VAL_INT_PLUS_NANO: + switch (type) { + case IIO_VAL_INT_PLUS_MICRO: + denominator = MICRO; + break; + case IIO_VAL_INT_PLUS_NANO: + denominator = NANO; + break; + } + *result = multiplier * abs(val); + *result += div_s64(multiplier * abs(val2), denominator); + if (val < 0 || val2 < 0) + *result *= -1; + return IIO_VAL_INT; + case IIO_VAL_FRACTIONAL: + *result = div_s64(multiplier * val, val2); + return IIO_VAL_INT; + case IIO_VAL_FRACTIONAL_LOG2: + *result = (multiplier * val) >> val2; + return IIO_VAL_INT; + default: + return -EINVAL; + } +} +EXPORT_SYMBOL_NS_GPL(iio_multiply_value, "IIO_UNIT_TEST"); + static int iio_convert_raw_to_processed_unlocked(struct iio_channel *chan, int raw, int *processed, unsigned int scale) { int scale_type, scale_val, scale_val2; int offset_type, offset_val, offset_val2; - s64 denominator, raw64 = raw; + s64 raw64 = raw; + int ret; offset_type = iio_channel_read(chan, &offset_val, &offset_val2, IIO_CHAN_INFO_OFFSET); @@ -644,35 +681,10 @@ static int iio_convert_raw_to_processed_unlocked(struct iio_channel *chan, return 0; } - switch (scale_type) { - case IIO_VAL_INT: - *processed = raw64 * scale_val * scale; - break; - case IIO_VAL_INT_PLUS_MICRO: - case IIO_VAL_INT_PLUS_NANO: - switch (scale_type) { - case IIO_VAL_INT_PLUS_MICRO: - denominator = MICRO; - break; - case IIO_VAL_INT_PLUS_NANO: - denominator = NANO; - break; - } - *processed = raw64 * scale * abs(scale_val); - *processed += div_s64(raw64 * scale * abs(scale_val2), denominator); - if (scale_val < 0 || scale_val2 < 0) - *processed *= -1; - break; - case IIO_VAL_FRACTIONAL: - *processed = div_s64(raw64 * (s64)scale_val * scale, - scale_val2); - break; - case IIO_VAL_FRACTIONAL_LOG2: - *processed = (raw64 * (s64)scale_val * scale) >> scale_val2; - break; - default: - return -EINVAL; - } + ret = iio_multiply_value(processed, raw64 * scale, + scale_type, scale_val, scale_val2); + if (ret < 0) + return ret; return 0; } diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h index 6a4479616479..a38b277c2c02 100644 --- a/include/linux/iio/consumer.h +++ b/include/linux/iio/consumer.h @@ -381,6 +381,24 @@ int iio_read_channel_offset(struct iio_channel *chan, int *val, int iio_read_channel_scale(struct iio_channel *chan, int *val, int *val2); +/** + * iio_multiply_value() 
- Multiply an IIO value + * @result: Destination pointer for the multiplication result + * @multiplier: Multiplier. + * @type: One of the IIO_VAL_* constants. This decides how the @val and + * @val2 parameters are interpreted. + * @val: Value being multiplied. + * @val2: Value being multiplied. @val2 use depends on type. + * + * Multiply an IIO value with a s64 multiplier storing the result as + * IIO_VAL_INT. This is typically used for scaling. + * + * Returns: + * IIO_VAL_INT on success or a negative error-number on failure. + */ +int iio_multiply_value(int *result, s64 multiplier, + unsigned int type, int val, int val2); + /** * iio_convert_raw_to_processed() - Converts a raw value to a processed value * @chan: The channel being queried -- cgit v1.2.3 From b2461e20fa9ac18b1305bba5bc7e22ebf644ea01 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 25 Aug 2025 15:00:30 +0800 Subject: firmware: imx: Add stub functions for SCMI MISC API To ensure successful builds when CONFIG_IMX_SCMI_MISC_DRV is not enabled, this patch adds static inline stub implementations for the following functions: - scmi_imx_misc_ctrl_get() - scmi_imx_misc_ctrl_set() These stubs return -EOPNOTSUPP to indicate that the functionality is not supported in the current configuration. This avoids potential build or link errors in code that conditionally calls these functions based on feature availability. This patch also drops the changes in commit 540c830212ed ("firmware: imx: remove duplicate scmi_imx_misc_ctrl_get()"). The original change aimed to simplify the handling of optional features by removing conditional stubs. However, the use of conditional stubs is necessary when CONFIG_IMX_SCMI_MISC_DRV is n, while consumer driver is set to y. This is not a matter of preserving legacy patterns, but rather to ensure that there is no link error whether for module or built-in. Fixes: 0b4f8a68b292 ("firmware: imx: Add i.MX95 MISC driver") Reviewed-by: Cristian Marussi Signed-off-by: Peng Fan Signed-off-by: Shawn Guo --- include/linux/firmware/imx/sm.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h index d4212bc42b2c..99c15bbb46aa 100644 --- a/include/linux/firmware/imx/sm.h +++ b/include/linux/firmware/imx/sm.h @@ -26,8 +26,20 @@ #define SCMI_IMX94_CTRL_SAI3_MCLK 5U /*!< WAKE SAI3 MCLK */ #define SCMI_IMX94_CTRL_SAI4_MCLK 6U /*!< WAKE SAI4 MCLK */ +#if IS_ENABLED(CONFIG_IMX_SCMI_MISC_DRV) int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val); int scmi_imx_misc_ctrl_set(u32 id, u32 val); +#else +static inline int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_misc_ctrl_set(u32 id, u32 val) +{ + return -EOPNOTSUPP; +} +#endif int scmi_imx_cpu_start(u32 cpuid, bool start); int scmi_imx_cpu_started(u32 cpuid, bool *started); -- cgit v1.2.3 From 3fb91b5c86d0fb5ff6f65c30a4f20193166e22fe Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 25 Aug 2025 15:00:31 +0800 Subject: firmware: imx: Add stub functions for SCMI LMM API To ensure successful builds when CONFIG_IMX_SCMI_LMM_DRV is not enabled, this patch adds static inline stub implementations for the following functions: - scmi_imx_lmm_operation() - scmi_imx_lmm_info() - scmi_imx_lmm_reset_vector_set() These stubs return -EOPNOTSUPP to indicate that the functionality is not supported in the current configuration. 
This avoids potential build or link errors in code that conditionally calls these functions based on feature availability. Fixes: 7242bbf418f0 ("firmware: imx: Add i.MX95 SCMI LMM driver") Reviewed-by: Cristian Marussi Signed-off-by: Peng Fan Signed-off-by: Shawn Guo --- include/linux/firmware/imx/sm.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h index 99c15bbb46aa..f2a72177bb37 100644 --- a/include/linux/firmware/imx/sm.h +++ b/include/linux/firmware/imx/sm.h @@ -56,7 +56,24 @@ enum scmi_imx_lmm_op { #define SCMI_IMX_LMM_OP_FORCEFUL 0 #define SCMI_IMX_LMM_OP_GRACEFUL BIT(0) +#if IS_ENABLED(CONFIG_IMX_SCMI_LMM_DRV) int scmi_imx_lmm_operation(u32 lmid, enum scmi_imx_lmm_op op, u32 flags); int scmi_imx_lmm_info(u32 lmid, struct scmi_imx_lmm_info *info); int scmi_imx_lmm_reset_vector_set(u32 lmid, u32 cpuid, u32 flags, u64 vector); +#else +static inline int scmi_imx_lmm_operation(u32 lmid, enum scmi_imx_lmm_op op, u32 flags) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_lmm_info(u32 lmid, struct scmi_imx_lmm_info *info) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_lmm_reset_vector_set(u32 lmid, u32 cpuid, u32 flags, u64 vector) +{ + return -EOPNOTSUPP; +} +#endif #endif -- cgit v1.2.3 From 222accf05fc42f68ae02065d9c1542c20315118b Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 25 Aug 2025 15:00:32 +0800 Subject: firmware: imx: Add stub functions for SCMI CPU API To ensure successful builds when CONFIG_IMX_SCMI_CPU_DRV is not enabled, this patch adds static inline stub implementations for the following functions: - scmi_imx_cpu_start() - scmi_imx_cpu_started() - scmi_imx_cpu_reset_vector_set() These stubs return -EOPNOTSUPP to indicate that the functionality is not supported in the current configuration. This avoids potential build or link errors in code that conditionally calls these functions based on feature availability. Fixes: 1055faa5d660 ("firmware: imx: Add i.MX95 SCMI CPU driver") Reviewed-by: Cristian Marussi Signed-off-by: Peng Fan Signed-off-by: Shawn Guo --- include/linux/firmware/imx/sm.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h index f2a72177bb37..a33b45027356 100644 --- a/include/linux/firmware/imx/sm.h +++ b/include/linux/firmware/imx/sm.h @@ -41,10 +41,28 @@ static inline int scmi_imx_misc_ctrl_set(u32 id, u32 val) } #endif +#if IS_ENABLED(CONFIG_IMX_SCMI_CPU_DRV) int scmi_imx_cpu_start(u32 cpuid, bool start); int scmi_imx_cpu_started(u32 cpuid, bool *started); int scmi_imx_cpu_reset_vector_set(u32 cpuid, u64 vector, bool start, bool boot, bool resume); +#else +static inline int scmi_imx_cpu_start(u32 cpuid, bool start) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_cpu_started(u32 cpuid, bool *started) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_cpu_reset_vector_set(u32 cpuid, u64 vector, bool start, + bool boot, bool resume) +{ + return -EOPNOTSUPP; +} +#endif enum scmi_imx_lmm_op { SCMI_IMX_LMM_BOOT, -- cgit v1.2.3 From c924c65f52c300ba36373e140a43a8e723c3abdd Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Wed, 13 Aug 2025 08:02:52 +0200 Subject: tee: implement protected DMA-heap Implement DMA heap for protected DMA-buf allocation in the TEE subsystem. Protected memory refers to memory buffers behind a hardware enforced firewall. 
It is not accessible to the kernel under normal circumstances, but is accessible only to certain hardware IPs or CPUs executing in a higher or differently privileged mode than the kernel itself. This interface allows allocating and managing such protected memory buffers via interaction with a TEE implementation. The protected memory is allocated for a specific use-case, like Secure Video Playback, Trusted UI, or Secure Video Recording, where certain hardware devices can access the memory. The DMA-heaps are enabled explicitly by the TEE backend driver. The TEE backend driver needs to implement a protected memory pool to manage the protected memory. Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/Kconfig | 5 + drivers/tee/Makefile | 1 + drivers/tee/tee_core.c | 4 + drivers/tee/tee_heap.c | 500 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/tee/tee_private.h | 6 + include/linux/tee_core.h | 53 +++++ 6 files changed, 569 insertions(+) create mode 100644 drivers/tee/tee_heap.c (limited to 'include/linux') diff --git a/drivers/tee/Kconfig b/drivers/tee/Kconfig index 61b507c18780..90600607a9d8 100644 --- a/drivers/tee/Kconfig +++ b/drivers/tee/Kconfig @@ -13,6 +13,11 @@ menuconfig TEE if TEE +config TEE_DMABUF_HEAPS + bool + depends on HAS_DMA && DMABUF_HEAPS + default y + source "drivers/tee/optee/Kconfig" source "drivers/tee/amdtee/Kconfig" source "drivers/tee/tstee/Kconfig" diff --git a/drivers/tee/Makefile b/drivers/tee/Makefile index 5488cba30bd2..949a6a79fb06 100644 --- a/drivers/tee/Makefile +++ b/drivers/tee/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_TEE) += tee.o tee-objs += tee_core.o +tee-objs += tee_heap.o tee-objs += tee_shm.o tee-objs += tee_shm_pool.o obj-$(CONFIG_OPTEE) += optee/ diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index acc7998758ad..2411d1e2aa7a 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -1064,6 +1064,8 @@ void tee_device_unregister(struct tee_device *teedev) if (!teedev) return; + tee_device_put_all_dma_heaps(teedev); + if (teedev->flags & TEE_DEVICE_FLAG_REGISTERED) cdev_device_del(&teedev->cdev, &teedev->dev); @@ -1287,3 +1289,5 @@ MODULE_AUTHOR("Linaro"); MODULE_DESCRIPTION("TEE Driver"); MODULE_VERSION("1.0"); MODULE_LICENSE("GPL v2"); +MODULE_IMPORT_NS("DMA_BUF"); +MODULE_IMPORT_NS("DMA_BUF_HEAP"); diff --git a/drivers/tee/tee_heap.c b/drivers/tee/tee_heap.c new file mode 100644 index 000000000000..d8d7735cdffb --- /dev/null +++ b/drivers/tee/tee_heap.c @@ -0,0 +1,500 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2025, Linaro Limited + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tee_private.h" + +struct tee_dma_heap { + struct dma_heap *heap; + enum tee_dma_heap_id id; + struct kref kref; + struct tee_protmem_pool *pool; + struct tee_device *teedev; + bool shutting_down; + /* Protects pool, teedev, and shutting_down above */ + struct mutex mu; +}; + +struct tee_heap_buffer { + struct tee_dma_heap *heap; + size_t size; + size_t offs; + struct sg_table table; +}; + +struct tee_heap_attachment { + struct sg_table table; + struct device *dev; +}; + +struct tee_protmem_static_pool { + struct tee_protmem_pool pool; + struct gen_pool *gen_pool; + phys_addr_t pa_base; +}; + +#if IS_ENABLED(CONFIG_TEE_DMABUF_HEAPS) +static DEFINE_XARRAY_ALLOC(tee_dma_heap); + +static void tee_heap_release(struct kref *kref) +{ + struct tee_dma_heap *h = container_of(kref, struct tee_dma_heap, kref); +
h->pool->ops->destroy_pool(h->pool); + tee_device_put(h->teedev); + h->pool = NULL; + h->teedev = NULL; +} + +static void put_tee_heap(struct tee_dma_heap *h) +{ + kref_put(&h->kref, tee_heap_release); +} + +static void get_tee_heap(struct tee_dma_heap *h) +{ + kref_get(&h->kref); +} + +static int copy_sg_table(struct sg_table *dst, struct sg_table *src) +{ + struct scatterlist *dst_sg; + struct scatterlist *src_sg; + int ret; + int i; + + ret = sg_alloc_table(dst, src->orig_nents, GFP_KERNEL); + if (ret) + return ret; + + dst_sg = dst->sgl; + for_each_sgtable_sg(src, src_sg, i) { + sg_set_page(dst_sg, sg_page(src_sg), src_sg->length, + src_sg->offset); + dst_sg = sg_next(dst_sg); + } + + return 0; +} + +static int tee_heap_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + struct tee_heap_buffer *buf = dmabuf->priv; + struct tee_heap_attachment *a; + int ret; + + a = kzalloc(sizeof(*a), GFP_KERNEL); + if (!a) + return -ENOMEM; + + ret = copy_sg_table(&a->table, &buf->table); + if (ret) { + kfree(a); + return ret; + } + + a->dev = attachment->dev; + attachment->priv = a; + + return 0; +} + +static void tee_heap_detach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + struct tee_heap_attachment *a = attachment->priv; + + sg_free_table(&a->table); + kfree(a); +} + +static struct sg_table * +tee_heap_map_dma_buf(struct dma_buf_attachment *attachment, + enum dma_data_direction direction) +{ + struct tee_heap_attachment *a = attachment->priv; + int ret; + + ret = dma_map_sgtable(attachment->dev, &a->table, direction, + DMA_ATTR_SKIP_CPU_SYNC); + if (ret) + return ERR_PTR(ret); + + return &a->table; +} + +static void tee_heap_unmap_dma_buf(struct dma_buf_attachment *attachment, + struct sg_table *table, + enum dma_data_direction direction) +{ + struct tee_heap_attachment *a = attachment->priv; + + WARN_ON(&a->table != table); + + dma_unmap_sgtable(attachment->dev, table, direction, + DMA_ATTR_SKIP_CPU_SYNC); +} + +static void tee_heap_buf_free(struct dma_buf *dmabuf) +{ + struct tee_heap_buffer *buf = dmabuf->priv; + + buf->heap->pool->ops->free(buf->heap->pool, &buf->table); + mutex_lock(&buf->heap->mu); + put_tee_heap(buf->heap); + mutex_unlock(&buf->heap->mu); + kfree(buf); +} + +static const struct dma_buf_ops tee_heap_buf_ops = { + .attach = tee_heap_attach, + .detach = tee_heap_detach, + .map_dma_buf = tee_heap_map_dma_buf, + .unmap_dma_buf = tee_heap_unmap_dma_buf, + .release = tee_heap_buf_free, +}; + +static struct dma_buf *tee_dma_heap_alloc(struct dma_heap *heap, + unsigned long len, u32 fd_flags, + u64 heap_flags) +{ + struct tee_dma_heap *h = dma_heap_get_drvdata(heap); + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); + struct tee_device *teedev = NULL; + struct tee_heap_buffer *buf; + struct tee_protmem_pool *pool; + struct dma_buf *dmabuf; + int rc; + + mutex_lock(&h->mu); + if (h->teedev) { + teedev = h->teedev; + pool = h->pool; + get_tee_heap(h); + } + mutex_unlock(&h->mu); + + if (!teedev) + return ERR_PTR(-EINVAL); + + buf = kzalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) { + dmabuf = ERR_PTR(-ENOMEM); + goto err; + } + buf->size = len; + buf->heap = h; + + rc = pool->ops->alloc(pool, &buf->table, len, &buf->offs); + if (rc) { + dmabuf = ERR_PTR(rc); + goto err_kfree; + } + + exp_info.ops = &tee_heap_buf_ops; + exp_info.size = len; + exp_info.priv = buf; + exp_info.flags = fd_flags; + dmabuf = dma_buf_export(&exp_info); + if (IS_ERR(dmabuf)) + goto err_protmem_free; + + return dmabuf; + +err_protmem_free: + pool->ops->free(pool, 
&buf->table); +err_kfree: + kfree(buf); +err: + mutex_lock(&h->mu); + put_tee_heap(h); + mutex_unlock(&h->mu); + return dmabuf; +} + +static const struct dma_heap_ops tee_dma_heap_ops = { + .allocate = tee_dma_heap_alloc, +}; + +static const char *heap_id_2_name(enum tee_dma_heap_id id) +{ + switch (id) { + case TEE_DMA_HEAP_SECURE_VIDEO_PLAY: + return "protected,secure-video"; + case TEE_DMA_HEAP_TRUSTED_UI: + return "protected,trusted-ui"; + case TEE_DMA_HEAP_SECURE_VIDEO_RECORD: + return "protected,secure-video-record"; + default: + return NULL; + } +} + +static int alloc_dma_heap(struct tee_device *teedev, enum tee_dma_heap_id id, + struct tee_protmem_pool *pool) +{ + struct dma_heap_export_info exp_info = { + .ops = &tee_dma_heap_ops, + .name = heap_id_2_name(id), + }; + struct tee_dma_heap *h; + int rc; + + if (!exp_info.name) + return -EINVAL; + + if (xa_reserve(&tee_dma_heap, id, GFP_KERNEL)) { + if (!xa_load(&tee_dma_heap, id)) + return -EEXIST; + return -ENOMEM; + } + + h = kzalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return -ENOMEM; + h->id = id; + kref_init(&h->kref); + h->teedev = teedev; + h->pool = pool; + mutex_init(&h->mu); + + exp_info.priv = h; + h->heap = dma_heap_add(&exp_info); + if (IS_ERR(h->heap)) { + rc = PTR_ERR(h->heap); + kfree(h); + + return rc; + } + + /* "can't fail" due to the call to xa_reserve() above */ + return WARN_ON(xa_is_err(xa_store(&tee_dma_heap, id, h, GFP_KERNEL))); +} + +int tee_device_register_dma_heap(struct tee_device *teedev, + enum tee_dma_heap_id id, + struct tee_protmem_pool *pool) +{ + struct tee_dma_heap *h; + int rc; + + if (!tee_device_get(teedev)) + return -EINVAL; + + h = xa_load(&tee_dma_heap, id); + if (h) { + mutex_lock(&h->mu); + if (h->teedev) { + rc = -EBUSY; + } else { + kref_init(&h->kref); + h->shutting_down = false; + h->teedev = teedev; + h->pool = pool; + rc = 0; + } + mutex_unlock(&h->mu); + } else { + rc = alloc_dma_heap(teedev, id, pool); + } + + if (rc) { + tee_device_put(teedev); + dev_err(&teedev->dev, "can't register DMA heap id %d (%s)\n", + id, heap_id_2_name(id)); + } + + return rc; +} +EXPORT_SYMBOL_GPL(tee_device_register_dma_heap); + +void tee_device_put_all_dma_heaps(struct tee_device *teedev) +{ + struct tee_dma_heap *h; + u_long i; + + xa_for_each(&tee_dma_heap, i, h) { + if (h) { + mutex_lock(&h->mu); + if (h->teedev == teedev && !h->shutting_down) { + h->shutting_down = true; + put_tee_heap(h); + } + mutex_unlock(&h->mu); + } + } +} +EXPORT_SYMBOL_GPL(tee_device_put_all_dma_heaps); + +int tee_heap_update_from_dma_buf(struct tee_device *teedev, + struct dma_buf *dmabuf, size_t *offset, + struct tee_shm *shm, + struct tee_shm **parent_shm) +{ + struct tee_heap_buffer *buf; + int rc; + + /* The DMA-buf must be from our heap */ + if (dmabuf->ops != &tee_heap_buf_ops) + return -EINVAL; + + buf = dmabuf->priv; + /* The buffer must be from the same teedev */ + if (buf->heap->teedev != teedev) + return -EINVAL; + + shm->size = buf->size; + + rc = buf->heap->pool->ops->update_shm(buf->heap->pool, &buf->table, + buf->offs, shm, parent_shm); + if (!rc && *parent_shm) + *offset = buf->offs; + + return rc; +} +#else +int tee_device_register_dma_heap(struct tee_device *teedev __always_unused, + enum tee_dma_heap_id id __always_unused, + struct tee_protmem_pool *pool __always_unused) +{ + return -EINVAL; +} +EXPORT_SYMBOL_GPL(tee_device_register_dma_heap); + +void +tee_device_put_all_dma_heaps(struct tee_device *teedev __always_unused) +{ +} +EXPORT_SYMBOL_GPL(tee_device_put_all_dma_heaps); + +int 
tee_heap_update_from_dma_buf(struct tee_device *teedev __always_unused, + struct dma_buf *dmabuf __always_unused, + size_t *offset __always_unused, + struct tee_shm *shm __always_unused, + struct tee_shm **parent_shm __always_unused) +{ + return -EINVAL; +} +#endif + +static struct tee_protmem_static_pool * +to_protmem_static_pool(struct tee_protmem_pool *pool) +{ + return container_of(pool, struct tee_protmem_static_pool, pool); +} + +static int protmem_pool_op_static_alloc(struct tee_protmem_pool *pool, + struct sg_table *sgt, size_t size, + size_t *offs) +{ + struct tee_protmem_static_pool *stp = to_protmem_static_pool(pool); + phys_addr_t pa; + int ret; + + pa = gen_pool_alloc(stp->gen_pool, size); + if (!pa) + return -ENOMEM; + + ret = sg_alloc_table(sgt, 1, GFP_KERNEL); + if (ret) { + gen_pool_free(stp->gen_pool, pa, size); + return ret; + } + + sg_set_page(sgt->sgl, phys_to_page(pa), size, 0); + *offs = pa - stp->pa_base; + + return 0; +} + +static void protmem_pool_op_static_free(struct tee_protmem_pool *pool, + struct sg_table *sgt) +{ + struct tee_protmem_static_pool *stp = to_protmem_static_pool(pool); + struct scatterlist *sg; + int i; + + for_each_sgtable_sg(sgt, sg, i) + gen_pool_free(stp->gen_pool, sg_phys(sg), sg->length); + sg_free_table(sgt); +} + +static int protmem_pool_op_static_update_shm(struct tee_protmem_pool *pool, + struct sg_table *sgt, size_t offs, + struct tee_shm *shm, + struct tee_shm **parent_shm) +{ + struct tee_protmem_static_pool *stp = to_protmem_static_pool(pool); + + shm->paddr = stp->pa_base + offs; + *parent_shm = NULL; + + return 0; +} + +static void protmem_pool_op_static_destroy_pool(struct tee_protmem_pool *pool) +{ + struct tee_protmem_static_pool *stp = to_protmem_static_pool(pool); + + gen_pool_destroy(stp->gen_pool); + kfree(stp); +} + +static struct tee_protmem_pool_ops protmem_pool_ops_static = { + .alloc = protmem_pool_op_static_alloc, + .free = protmem_pool_op_static_free, + .update_shm = protmem_pool_op_static_update_shm, + .destroy_pool = protmem_pool_op_static_destroy_pool, +}; + +struct tee_protmem_pool *tee_protmem_static_pool_alloc(phys_addr_t paddr, + size_t size) +{ + const size_t page_mask = PAGE_SIZE - 1; + struct tee_protmem_static_pool *stp; + int rc; + + /* Check it's page aligned */ + if ((paddr | size) & page_mask) + return ERR_PTR(-EINVAL); + + if (!pfn_valid(PHYS_PFN(paddr))) + return ERR_PTR(-EINVAL); + + stp = kzalloc(sizeof(*stp), GFP_KERNEL); + if (!stp) + return ERR_PTR(-ENOMEM); + + stp->gen_pool = gen_pool_create(PAGE_SHIFT, -1); + if (!stp->gen_pool) { + rc = -ENOMEM; + goto err_free; + } + + rc = gen_pool_add(stp->gen_pool, paddr, size, -1); + if (rc) + goto err_free_pool; + + stp->pool.ops = &protmem_pool_ops_static; + stp->pa_base = paddr; + return &stp->pool; + +err_free_pool: + gen_pool_destroy(stp->gen_pool); +err_free: + kfree(stp); + + return ERR_PTR(rc); +} +EXPORT_SYMBOL_GPL(tee_protmem_static_pool_alloc); diff --git a/drivers/tee/tee_private.h b/drivers/tee/tee_private.h index 9bc50605227c..6c6ff5d5eed2 100644 --- a/drivers/tee/tee_private.h +++ b/drivers/tee/tee_private.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -24,4 +25,9 @@ struct tee_shm *tee_shm_alloc_user_buf(struct tee_context *ctx, size_t size); struct tee_shm *tee_shm_register_user_buf(struct tee_context *ctx, unsigned long addr, size_t length); +int tee_heap_update_from_dma_buf(struct tee_device *teedev, + struct dma_buf *dmabuf, size_t *offset, + struct tee_shm *shm, + struct tee_shm **parent_shm); 
+
 #endif /*TEE_PRIVATE_H*/
diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h
index a38494d6b5f4..28b65010b9ed 100644
--- a/include/linux/tee_core.h
+++ b/include/linux/tee_core.h
@@ -8,9 +8,11 @@
 #include
 #include
+#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -30,6 +32,12 @@
 #define TEE_DEVICE_FLAG_REGISTERED	0x1
 #define TEE_MAX_DEV_NAME_LEN		32

+enum tee_dma_heap_id {
+	TEE_DMA_HEAP_SECURE_VIDEO_PLAY = 1,
+	TEE_DMA_HEAP_TRUSTED_UI,
+	TEE_DMA_HEAP_SECURE_VIDEO_RECORD,
+};
+
 /**
  * struct tee_device - TEE Device representation
  * @name:	name of device
@@ -116,6 +124,36 @@ struct tee_desc {
 	u32 flags;
 };

+/**
+ * struct tee_protmem_pool - protected memory pool
+ * @ops: operations
+ *
+ * This is an abstract interface where this struct is expected to be
+ * embedded in another struct specific to the implementation.
+ */
+struct tee_protmem_pool {
+	const struct tee_protmem_pool_ops *ops;
+};
+
+/**
+ * struct tee_protmem_pool_ops - protected memory pool operations
+ * @alloc: called when allocating protected memory
+ * @free: called when freeing protected memory
+ * @update_shm: called when registering a dma-buf to update the @shm
+ *		with physical address of the buffer or to return the
+ *		@parent_shm of the memory pool
+ * @destroy_pool: called when destroying the pool
+ */
+struct tee_protmem_pool_ops {
+	int (*alloc)(struct tee_protmem_pool *pool, struct sg_table *sgt,
+		     size_t size, size_t *offs);
+	void (*free)(struct tee_protmem_pool *pool, struct sg_table *sgt);
+	int (*update_shm)(struct tee_protmem_pool *pool, struct sg_table *sgt,
+			  size_t offs, struct tee_shm *shm,
+			  struct tee_shm **parent_shm);
+	void (*destroy_pool)(struct tee_protmem_pool *pool);
+};
+
 /**
  * tee_device_alloc() - Allocate a new struct tee_device instance
  * @teedesc:	Descriptor for this driver
@@ -154,6 +192,11 @@ int tee_device_register(struct tee_device *teedev);
 */
 void tee_device_unregister(struct tee_device *teedev);

+int tee_device_register_dma_heap(struct tee_device *teedev,
+				 enum tee_dma_heap_id id,
+				 struct tee_protmem_pool *pool);
+void tee_device_put_all_dma_heaps(struct tee_device *teedev);
+
 /**
  * tee_device_set_dev_groups() - Set device attribute groups
  * @teedev:	Device to register
@@ -229,6 +272,16 @@ static inline void tee_shm_pool_free(struct tee_shm_pool *pool)
 {
	pool->ops->destroy_pool(pool);
 }

+/**
+ * tee_protmem_static_pool_alloc() - Create a protected memory manager
+ * @paddr:	Physical address of start of pool
+ * @size:	Size in bytes of the pool
+ *
+ * @returns pointer to a 'struct tee_protmem_pool' or an ERR_PTR on failure.
+ */
+struct tee_protmem_pool *tee_protmem_static_pool_alloc(phys_addr_t paddr,
+						       size_t size);
+
 /**
  * tee_get_drvdata() - Return driver_data pointer
  * @returns the driver_data pointer supplied to tee_register().
--
cgit v1.2.3


From 146bf4e75ecab9759ed78c9d167e860042d627fb Mon Sep 17 00:00:00 2001
From: Etienne Carriere
Date: Wed, 13 Aug 2025 08:02:54 +0200
Subject: tee: new ioctl to register a tee_shm from a dmabuf file descriptor

Add a userspace API to create a tee_shm object that refers to a dmabuf
reference.

Userspace registers the dmabuf file descriptor in a tee_shm object. The
registration completes with a file descriptor returned for the new
tee_shm object. Userspace is free to close the dmabuf file descriptor
after it has been registered since all the resources are now held via
the new tee_shm object.
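As a rough userspace sketch (hypothetical; tee_fd and dmabuf_fd are
assumed to be valid descriptors, and error handling is omitted), the
intended flow reads:

	struct tee_ioctl_shm_register_fd_data data = { .fd = dmabuf_fd };
	int shm_fd;

	/* trade the dmabuf fd for a tee_shm object and its fd */
	shm_fd = ioctl(tee_fd, TEE_IOC_SHM_REGISTER_FD, &data);

	/* safe: the tee_shm holds its own reference to the dmabuf */
	close(dmabuf_fd);

	/* data.id can now be used in memref parameters of invocations */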
Closing the tee_shm file descriptor will eventually release all resources used by the tee_shm object when all references are released. The new IOCTL, TEE_IOC_SHM_REGISTER_FD, supports dmabuf references to physically contiguous memory buffers. Dmabuf references acquired from the TEE DMA-heap can be used as protected memory for Secure Video Path and such use cases. It depends on the TEE and the TEE driver if dmabuf references acquired by other means can be used. A new tee_shm flag is added to identify tee_shm objects built from a registered dmabuf, TEE_SHM_DMA_BUF. Signed-off-by: Etienne Carriere Signed-off-by: Olivier Masse Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 62 ++++++++++++++++++++++++++++++++++++++- drivers/tee/tee_private.h | 8 +++++ drivers/tee/tee_shm.c | 74 +++++++++++++++++++++++++++++++++++++++++++++-- include/linux/tee_core.h | 1 + include/linux/tee_drv.h | 10 +++++++ include/uapi/linux/tee.h | 31 ++++++++++++++++++++ 6 files changed, 182 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 4996f194cd9e..cea4da3ea46f 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -354,11 +354,49 @@ tee_ioctl_shm_register(struct tee_context *ctx, return ret; } +static int +tee_ioctl_shm_register_fd(struct tee_context *ctx, + struct tee_ioctl_shm_register_fd_data __user *udata) +{ + struct tee_ioctl_shm_register_fd_data data; + struct tee_shm *shm; + long ret; + + if (copy_from_user(&data, udata, sizeof(data))) + return -EFAULT; + + /* Currently no input flags are supported */ + if (data.flags) + return -EINVAL; + + shm = tee_shm_register_fd(ctx, data.fd); + if (IS_ERR(shm)) + return -EINVAL; + + data.id = shm->id; + data.flags = shm->flags; + data.size = shm->size; + + if (copy_to_user(udata, &data, sizeof(data))) + ret = -EFAULT; + else + ret = tee_shm_get_fd(shm); + + /* + * When user space closes the file descriptor the shared memory + * should be freed or if tee_shm_get_fd() failed then it will + * be freed immediately. + */ + tee_shm_put(shm); + return ret; +} + static int param_from_user_memref(struct tee_context *ctx, struct tee_param_memref *memref, struct tee_ioctl_param *ip) { struct tee_shm *shm; + size_t offs = 0; /* * If a NULL pointer is passed to a TA in the TEE, @@ -389,6 +427,26 @@ static int param_from_user_memref(struct tee_context *ctx, tee_shm_put(shm); return -EINVAL; } + + if (shm->flags & TEE_SHM_DMA_BUF) { + struct tee_shm_dmabuf_ref *ref; + + ref = container_of(shm, struct tee_shm_dmabuf_ref, shm); + if (ref->parent_shm) { + /* + * The shm already has one reference to + * ref->parent_shm so we are clear of 0. + * We're getting another reference since + * this shm will be used in the parameter + * list instead of the shm we got with + * tee_shm_get_from_id() above. 
+ */ + refcount_inc(&ref->parent_shm->refcount); + tee_shm_put(shm); + shm = ref->parent_shm; + offs = ref->offset; + } + } } else if (ctx->cap_memref_null) { /* Pass NULL pointer to OP-TEE */ shm = NULL; @@ -396,7 +454,7 @@ static int param_from_user_memref(struct tee_context *ctx, return -EINVAL; } - memref->shm_offs = ip->a; + memref->shm_offs = ip->a + offs; memref->size = ip->b; memref->shm = shm; @@ -842,6 +900,8 @@ static long tee_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return tee_ioctl_shm_alloc(ctx, uarg); case TEE_IOC_SHM_REGISTER: return tee_ioctl_shm_register(ctx, uarg); + case TEE_IOC_SHM_REGISTER_FD: + return tee_ioctl_shm_register_fd(ctx, uarg); case TEE_IOC_OPEN_SESSION: return tee_ioctl_open_session(ctx, uarg); case TEE_IOC_INVOKE: diff --git a/drivers/tee/tee_private.h b/drivers/tee/tee_private.h index 6c6ff5d5eed2..a9b5e4a6a8f7 100644 --- a/drivers/tee/tee_private.h +++ b/drivers/tee/tee_private.h @@ -13,6 +13,14 @@ #include #include +/* extra references appended to shm object for registered shared memory */ +struct tee_shm_dmabuf_ref { + struct tee_shm shm; + size_t offset; + struct dma_buf *dmabuf; + struct tee_shm *parent_shm; +}; + int tee_shm_get_fd(struct tee_shm *shm); bool tee_device_get(struct tee_device *teedev); diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index daf6e5cfd59a..76195a398c89 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -4,6 +4,7 @@ */ #include #include +#include #include #include #include @@ -45,7 +46,15 @@ static void release_registered_pages(struct tee_shm *shm) static void tee_shm_release(struct tee_device *teedev, struct tee_shm *shm) { - if (shm->flags & TEE_SHM_POOL) { + void *p = shm; + + if (shm->flags & TEE_SHM_DMA_BUF) { + struct tee_shm_dmabuf_ref *ref; + + ref = container_of(shm, struct tee_shm_dmabuf_ref, shm); + p = ref; + dma_buf_put(ref->dmabuf); + } else if (shm->flags & TEE_SHM_POOL) { teedev->pool->ops->free(teedev->pool, shm); } else if (shm->flags & TEE_SHM_DYNAMIC) { int rc = teedev->desc->ops->shm_unregister(shm->ctx, shm); @@ -59,7 +68,7 @@ static void tee_shm_release(struct tee_device *teedev, struct tee_shm *shm) teedev_ctx_put(shm->ctx); - kfree(shm); + kfree(p); tee_device_put(teedev); } @@ -169,7 +178,7 @@ struct tee_shm *tee_shm_alloc_user_buf(struct tee_context *ctx, size_t size) * tee_client_invoke_func(). The memory allocated is later freed with a * call to tee_shm_free(). 
* - * @returns a pointer to 'struct tee_shm' + * @returns a pointer to 'struct tee_shm' on success, and ERR_PTR on failure */ struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size) { @@ -179,6 +188,62 @@ struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size) } EXPORT_SYMBOL_GPL(tee_shm_alloc_kernel_buf); +struct tee_shm *tee_shm_register_fd(struct tee_context *ctx, int fd) +{ + struct tee_shm_dmabuf_ref *ref; + int rc; + + if (!tee_device_get(ctx->teedev)) + return ERR_PTR(-EINVAL); + + teedev_ctx_get(ctx); + + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!ref) { + rc = -ENOMEM; + goto err_put_tee; + } + + refcount_set(&ref->shm.refcount, 1); + ref->shm.ctx = ctx; + ref->shm.id = -1; + ref->shm.flags = TEE_SHM_DMA_BUF; + + ref->dmabuf = dma_buf_get(fd); + if (IS_ERR(ref->dmabuf)) { + rc = PTR_ERR(ref->dmabuf); + goto err_kfree_ref; + } + + rc = tee_heap_update_from_dma_buf(ctx->teedev, ref->dmabuf, + &ref->offset, &ref->shm, + &ref->parent_shm); + if (rc) + goto err_put_dmabuf; + + mutex_lock(&ref->shm.ctx->teedev->mutex); + ref->shm.id = idr_alloc(&ref->shm.ctx->teedev->idr, &ref->shm, + 1, 0, GFP_KERNEL); + mutex_unlock(&ref->shm.ctx->teedev->mutex); + if (ref->shm.id < 0) { + rc = ref->shm.id; + goto err_put_dmabuf; + } + + return &ref->shm; + +err_put_dmabuf: + dma_buf_put(ref->dmabuf); +err_kfree_ref: + kfree(ref); +err_put_tee: + teedev_ctx_put(ctx); + tee_device_put(ctx->teedev); + + return ERR_PTR(rc); +} +EXPORT_SYMBOL_GPL(tee_shm_register_fd); + /** * tee_shm_alloc_priv_buf() - Allocate shared memory for a privately shared * kernel buffer @@ -442,6 +507,9 @@ static int tee_shm_fop_mmap(struct file *filp, struct vm_area_struct *vma) /* Refuse sharing shared memory provided by application */ if (shm->flags & TEE_SHM_USER_MAPPED) return -EINVAL; + /* Refuse sharing registered DMA_bufs with the application */ + if (shm->flags & TEE_SHM_DMA_BUF) + return -EINVAL; /* check for overflowing the buffer's size */ if (vma->vm_pgoff + vma_pages(vma) > shm->size >> PAGE_SHIFT) diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h index 28b65010b9ed..b6c54b34a8b5 100644 --- a/include/linux/tee_core.h +++ b/include/linux/tee_core.h @@ -28,6 +28,7 @@ #define TEE_SHM_USER_MAPPED BIT(1) /* Memory mapped in user space */ #define TEE_SHM_POOL BIT(2) /* Memory allocated from pool */ #define TEE_SHM_PRIV BIT(3) /* Memory private to TEE driver */ +#define TEE_SHM_DMA_BUF BIT(4) /* Memory with dma-buf handle */ #define TEE_DEVICE_FLAG_REGISTERED 0x1 #define TEE_MAX_DEV_NAME_LEN 32 diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index a54c203000ed..824f1251de60 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -116,6 +116,16 @@ struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size); struct tee_shm *tee_shm_register_kernel_buf(struct tee_context *ctx, void *addr, size_t length); +/** + * tee_shm_register_fd() - Register shared memory from file descriptor + * + * @ctx: Context that allocates the shared memory + * @fd: Shared memory file descriptor reference + * + * @returns a pointer to 'struct tee_shm' on success, and ERR_PTR on failure + */ +struct tee_shm *tee_shm_register_fd(struct tee_context *ctx, int fd); + /** * tee_shm_free() - Free shared memory * @shm: Handle to shared memory to free diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index d0430bee8292..d843cf980d98 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -378,6 +378,37 @@ struct 
tee_ioctl_shm_register_data {
	__s32 id;
};

+/**
+ * struct tee_ioctl_shm_register_fd_data - Shared memory registering argument
+ * @fd:		[in] File descriptor identifying dmabuf reference
+ * @size:	[out] Size of referenced memory
+ * @flags:	[in] Flags to/from allocation.
+ * @id:		[out] Identifier of the shared memory
+ *
+ * The flags field should currently be zero as input. Updated by the call
+ * with actual flags as defined by TEE_IOCTL_SHM_* above.
+ * This structure is used as argument for TEE_IOC_SHM_REGISTER_FD below.
+ */
+struct tee_ioctl_shm_register_fd_data {
+	__s64 fd;
+	__u64 size;
+	__u32 flags;
+	__s32 id;
+};
+
+/**
+ * TEE_IOC_SHM_REGISTER_FD - register a shared memory from a file descriptor
+ *
+ * Returns a file descriptor on success or < 0 on failure
+ *
+ * The returned file descriptor refers to the shared memory object in the
+ * kernel. The supplied file descriptor can be closed if it's not needed
+ * for other purposes. The shared memory is freed when the descriptor is
+ * closed.
+ */
+#define TEE_IOC_SHM_REGISTER_FD _IOWR(TEE_IOC_MAGIC, TEE_IOC_BASE + 8, \
+				      struct tee_ioctl_shm_register_fd_data)
+
 /**
  * TEE_IOC_SHM_REGISTER - Register shared memory argument
  *
--
cgit v1.2.3


From ab09dd6d9201af9930efd5a5a0cb56a0fea6a169 Mon Sep 17 00:00:00 2001
From: Jens Wiklander
Date: Wed, 13 Aug 2025 08:02:55 +0200
Subject: tee: add tee_shm_alloc_dma_mem()

Add tee_shm_alloc_dma_mem() to allocate DMA memory. The memory is
represented by a tee_shm object using the new flag TEE_SHM_DMA_MEM to
identify it as DMA memory. The allocated memory will later be lent to
the TEE to be used as protected memory.

Reviewed-by: Sumit Garg
Signed-off-by: Jens Wiklander
---
 drivers/tee/tee_shm.c    | 85 ++++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/tee_core.h |  5 +++
 2 files changed, 88 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index 76195a398c89..e195c892431d 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -5,6 +5,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -13,9 +15,14 @@
 #include
 #include
 #include
-#include
 #include "tee_private.h"

+struct tee_shm_dma_mem {
+	struct tee_shm shm;
+	dma_addr_t dma_addr;
+	struct page *page;
+};
+
 static void shm_put_kernel_pages(struct page **pages, size_t page_count)
 {
	size_t n;
@@ -48,7 +55,16 @@ static void tee_shm_release(struct tee_device *teedev, struct tee_shm *shm)
 {
	void *p = shm;

-	if (shm->flags & TEE_SHM_DMA_BUF) {
+	if (shm->flags & TEE_SHM_DMA_MEM) {
+#if IS_ENABLED(CONFIG_TEE_DMABUF_HEAPS)
+		struct tee_shm_dma_mem *dma_mem;
+
+		dma_mem = container_of(shm, struct tee_shm_dma_mem, shm);
+		p = dma_mem;
+		dma_free_pages(&teedev->dev, shm->size, dma_mem->page,
+			       dma_mem->dma_addr, DMA_BIDIRECTIONAL);
+#endif
+	} else if (shm->flags & TEE_SHM_DMA_BUF) {
		struct tee_shm_dmabuf_ref *ref;

		ref = container_of(shm, struct tee_shm_dmabuf_ref, shm);
@@ -268,6 +284,71 @@ struct tee_shm *tee_shm_alloc_priv_buf(struct tee_context *ctx, size_t size)
 }
 EXPORT_SYMBOL_GPL(tee_shm_alloc_priv_buf);

+#if IS_ENABLED(CONFIG_TEE_DMABUF_HEAPS)
+/**
+ * tee_shm_alloc_dma_mem() - Allocate DMA memory as shared memory object
+ * @ctx: Context that allocates the shared memory
+ * @page_count: Number of pages
+ *
+ * The allocated memory is expected to be lent (made inaccessible to the
+ * kernel) to the TEE while it's used and returned (accessible to the
+ * kernel again) before it's freed.
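+ *
+ * A minimal in-kernel usage sketch (hypothetical, not part of this patch),
+ * where the lend and return steps are subsystem specific:
+ *
+ *	shm = tee_shm_alloc_dma_mem(ctx, 4);
+ *	if (IS_ERR(shm))
+ *		return PTR_ERR(shm);
+ *	...lend shm->paddr to the TEE, use it, get it returned...
+ *	tee_shm_free(shm);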
+ *
+ * This function should normally only be used internally in the TEE
+ * drivers.
+ *
+ * @returns a pointer to 'struct tee_shm'
+ */
+struct tee_shm *tee_shm_alloc_dma_mem(struct tee_context *ctx,
+				      size_t page_count)
+{
+	struct tee_device *teedev = ctx->teedev;
+	struct tee_shm_dma_mem *dma_mem;
+	dma_addr_t dma_addr;
+	struct page *page;
+
+	if (!tee_device_get(teedev))
+		return ERR_PTR(-EINVAL);
+
+	page = dma_alloc_pages(&teedev->dev, page_count * PAGE_SIZE,
+			       &dma_addr, DMA_BIDIRECTIONAL, GFP_KERNEL);
+	if (!page)
+		goto err_put_teedev;
+
+	dma_mem = kzalloc(sizeof(*dma_mem), GFP_KERNEL);
+	if (!dma_mem)
+		goto err_free_pages;
+
+	refcount_set(&dma_mem->shm.refcount, 1);
+	dma_mem->shm.ctx = ctx;
+	dma_mem->shm.paddr = page_to_phys(page);
+	dma_mem->dma_addr = dma_addr;
+	dma_mem->page = page;
+	dma_mem->shm.size = page_count * PAGE_SIZE;
+	dma_mem->shm.flags = TEE_SHM_DMA_MEM;
+
+	teedev_ctx_get(ctx);
+
+	return &dma_mem->shm;
+
+err_free_pages:
+	dma_free_pages(&teedev->dev, page_count * PAGE_SIZE, page, dma_addr,
+		       DMA_BIDIRECTIONAL);
+err_put_teedev:
+	tee_device_put(teedev);
+
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL_GPL(tee_shm_alloc_dma_mem);
+#else
+struct tee_shm *tee_shm_alloc_dma_mem(struct tee_context *ctx,
+				      size_t page_count)
+{
+	return ERR_PTR(-EINVAL);
+}
+EXPORT_SYMBOL_GPL(tee_shm_alloc_dma_mem);
+#endif
+
 int tee_dyn_shm_alloc_helper(struct tee_shm *shm, size_t size, size_t align,
			     int (*shm_register)(struct tee_context *ctx,
						 struct tee_shm *shm,
diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h
index b6c54b34a8b5..7b0c1da2ca6c 100644
--- a/include/linux/tee_core.h
+++ b/include/linux/tee_core.h
@@ -29,6 +29,8 @@
 #define TEE_SHM_POOL		BIT(2) /* Memory allocated from pool */
 #define TEE_SHM_PRIV		BIT(3) /* Memory private to TEE driver */
 #define TEE_SHM_DMA_BUF		BIT(4) /* Memory with dma-buf handle */
+#define TEE_SHM_DMA_MEM		BIT(5) /* Memory allocated with */
+					/* dma_alloc_pages() */

 #define TEE_DEVICE_FLAG_REGISTERED	0x1
 #define TEE_MAX_DEV_NAME_LEN		32
@@ -298,6 +300,9 @@ void *tee_get_drvdata(struct tee_device *teedev);
 */
 struct tee_shm *tee_shm_alloc_priv_buf(struct tee_context *ctx, size_t size);

+struct tee_shm *tee_shm_alloc_dma_mem(struct tee_context *ctx,
+				      size_t page_count);
+
 int tee_dyn_shm_alloc_helper(struct tee_shm *shm, size_t size, size_t align,
			     int (*shm_register)(struct tee_context *ctx,
						 struct tee_shm *shm,
--
cgit v1.2.3


From 002ebddd695a53999550e241b71950f1aa0e1ac4 Mon Sep 17 00:00:00 2001
From: Ulf Hansson
Date: Tue, 9 Sep 2025 13:11:20 +0200
Subject: pmdomain: core: Restore behaviour for disabling unused PM domains

Recent changes to genpd prevent those PM domains being powered-on during
initialization from being powered-off during the boot sequence. Based upon
whether CONFIG_PM_GENERIC_DOMAINS_OF is set or not, genpd relies on the
sync_state mechanism or on genpd_power_off_unused() (a late_initcall_sync)
to understand when it's okay to allow these PM domains to be powered-off.

This new behaviour in genpd has led to problems on different platforms.
Let's therefore restore the behaviour of genpd_power_off_unused().
Moreover, let's introduce GENPD_FLAG_NO_STAY_ON, to allow genpd OF
providers to opt-out from the new behaviour.
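A provider wanting the old semantics can opt out per domain; a minimal
sketch (hypothetical, using the flag introduced below):

	genpd->flags |= GENPD_FLAG_NO_STAY_ON;
	ret = pm_genpd_init(genpd, NULL, false);

With the flag set, genpd_set_stay_on() leaves stay_on false even for a
domain that is powered on at init, so genpd_power_off_unused() can power
it off at late_initcall_sync.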
Link: https://lore.kernel.org/all/20250701114733.636510-1-ulf.hansson@linaro.org/ Reported-by: Geert Uytterhoeven Link: https://lore.kernel.org/all/20250902-rk3576-lockup-regression-v1-1-c4a0c9daeb00@collabora.com/ Reported-by: Nicolas Frattaroli Fixes: 0e789b491ba0 ("pmdomain: core: Leave powered-on genpds on until sync_state") Fixes: 13a4b7fb6260 ("pmdomain: core: Leave powered-on genpds on until late_initcall_sync") Tested-by: Heiko Stuebner Reviewed-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Signed-off-by: Ulf Hansson --- drivers/pmdomain/core.c | 20 ++++++++++++++------ include/linux/pm_domain.h | 7 +++++++ 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c index 0006ab3d0789..61c2277c9ce3 100644 --- a/drivers/pmdomain/core.c +++ b/drivers/pmdomain/core.c @@ -187,6 +187,7 @@ static const struct genpd_lock_ops genpd_raw_spin_ops = { #define genpd_is_opp_table_fw(genpd) (genpd->flags & GENPD_FLAG_OPP_TABLE_FW) #define genpd_is_dev_name_fw(genpd) (genpd->flags & GENPD_FLAG_DEV_NAME_FW) #define genpd_is_no_sync_state(genpd) (genpd->flags & GENPD_FLAG_NO_SYNC_STATE) +#define genpd_is_no_stay_on(genpd) (genpd->flags & GENPD_FLAG_NO_STAY_ON) static inline bool irq_safe_dev_in_sleep_domain(struct device *dev, const struct generic_pm_domain *genpd) @@ -1357,7 +1358,6 @@ err_poweroff: return ret; } -#ifndef CONFIG_PM_GENERIC_DOMAINS_OF static bool pd_ignore_unused; static int __init pd_ignore_unused_setup(char *__unused) { @@ -1382,9 +1382,6 @@ static int __init genpd_power_off_unused(void) mutex_lock(&gpd_list_lock); list_for_each_entry(genpd, &gpd_list, gpd_list_node) { - genpd_lock(genpd); - genpd->stay_on = false; - genpd_unlock(genpd); genpd_queue_power_off_work(genpd); } @@ -1393,7 +1390,6 @@ static int __init genpd_power_off_unused(void) return 0; } late_initcall_sync(genpd_power_off_unused); -#endif #ifdef CONFIG_PM_SLEEP @@ -2367,6 +2363,18 @@ static void genpd_lock_init(struct generic_pm_domain *genpd) } } +#ifdef CONFIG_PM_GENERIC_DOMAINS_OF +static void genpd_set_stay_on(struct generic_pm_domain *genpd, bool is_off) +{ + genpd->stay_on = !genpd_is_no_stay_on(genpd) && !is_off; +} +#else +static void genpd_set_stay_on(struct generic_pm_domain *genpd, bool is_off) +{ + genpd->stay_on = false; +} +#endif + /** * pm_genpd_init - Initialize a generic I/O PM domain object. * @genpd: PM domain object to initialize. @@ -2392,7 +2400,7 @@ int pm_genpd_init(struct generic_pm_domain *genpd, INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn); atomic_set(&genpd->sd_count, 0); genpd->status = is_off ? GENPD_STATE_OFF : GENPD_STATE_ON; - genpd->stay_on = !is_off; + genpd_set_stay_on(genpd, is_off); genpd->sync_state = GENPD_SYNC_STATE_OFF; genpd->device_count = 0; genpd->provider = NULL; diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index c84edf217819..f67a2cb7d781 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -115,6 +115,12 @@ struct dev_pm_domain_list { * genpd provider specific way, likely through a * parent device node. This flag makes genpd to * skip its internal support for this. + * + * GENPD_FLAG_NO_STAY_ON: For genpd OF providers a powered-on PM domain at + * initialization is prevented from being + * powered-off until the ->sync_state() callback is + * invoked. This flag informs genpd to allow a + * power-off without waiting for ->sync_state(). 
*/ #define GENPD_FLAG_PM_CLK (1U << 0) #define GENPD_FLAG_IRQ_SAFE (1U << 1) @@ -126,6 +132,7 @@ struct dev_pm_domain_list { #define GENPD_FLAG_OPP_TABLE_FW (1U << 7) #define GENPD_FLAG_DEV_NAME_FW (1U << 8) #define GENPD_FLAG_NO_SYNC_STATE (1U << 9) +#define GENPD_FLAG_NO_STAY_ON (1U << 10) enum gpd_status { GENPD_STATE_ON = 0, /* PM domain is on */ -- cgit v1.2.3 From f0addd325ef692c92c522a2ba4d9db13fc90e664 Mon Sep 17 00:00:00 2001 From: Alexander Kurz Date: Mon, 11 Aug 2025 06:43:58 +0000 Subject: mfd: input: rtc: mc13783: Remove deprecated mc13xxx_irq_ack() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mc13xxx_irq_ack() got deprecated and became dead code with commit 10f9edaeaa30 ("mfd: mc13xxx: Use regmap irq framework for interrupts"). It should be safe to remove it now. Signed-off-by: Alexander Kurz Acked-by: Alexandre Belloni Acked-by: Uwe Kleine-König Acked-by: Dmitry Torokhov # for input Link: https://lore.kernel.org/r/20250811064358.1659-1-akurz@blala.de Signed-off-by: Lee Jones --- drivers/input/misc/mc13783-pwrbutton.c | 1 - drivers/input/touchscreen/mc13783_ts.c | 4 ---- drivers/rtc/rtc-mc13xxx.c | 13 ------------- include/linux/mfd/mc13xxx.h | 6 ------ 4 files changed, 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/misc/mc13783-pwrbutton.c b/drivers/input/misc/mc13783-pwrbutton.c index 1c7faa9b7afe..b83d762ae2e9 100644 --- a/drivers/input/misc/mc13783-pwrbutton.c +++ b/drivers/input/misc/mc13783-pwrbutton.c @@ -57,7 +57,6 @@ static irqreturn_t button_irq(int irq, void *_priv) struct mc13783_pwrb *priv = _priv; int val; - mc13xxx_irq_ack(priv->mc13783, irq); mc13xxx_reg_read(priv->mc13783, MC13783_REG_INTERRUPT_SENSE_1, &val); switch (irq) { diff --git a/drivers/input/touchscreen/mc13783_ts.c b/drivers/input/touchscreen/mc13783_ts.c index 33635da85079..47b8da00027f 100644 --- a/drivers/input/touchscreen/mc13783_ts.c +++ b/drivers/input/touchscreen/mc13783_ts.c @@ -42,8 +42,6 @@ static irqreturn_t mc13783_ts_handler(int irq, void *data) { struct mc13783_ts_priv *priv = data; - mc13xxx_irq_ack(priv->mc13xxx, irq); - /* * Kick off reading coordinates. Note that if work happens already * be queued for future execution (it rearms itself) it will not @@ -137,8 +135,6 @@ static int mc13783_ts_open(struct input_dev *dev) mc13xxx_lock(priv->mc13xxx); - mc13xxx_irq_ack(priv->mc13xxx, MC13XXX_IRQ_TS); - ret = mc13xxx_irq_request(priv->mc13xxx, MC13XXX_IRQ_TS, mc13783_ts_handler, MC13783_TS_NAME, priv); if (ret) diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index e7b87130e624..2494d13fd767 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -137,10 +137,6 @@ static int mc13xxx_rtc_set_time(struct device *dev, struct rtc_time *tm) } if (!priv->valid) { - ret = mc13xxx_irq_ack(priv->mc13xxx, MC13XXX_IRQ_RTCRST); - if (unlikely(ret)) - goto out; - ret = mc13xxx_irq_unmask(priv->mc13xxx, MC13XXX_IRQ_RTCRST); } @@ -208,10 +204,6 @@ static int mc13xxx_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) if (unlikely(ret)) goto out; - ret = mc13xxx_irq_ack(priv->mc13xxx, MC13XXX_IRQ_TODA); - if (unlikely(ret)) - goto out; - s1970 = rtc_tm_to_time64(&alarm->time); dev_dbg(dev, "%s: %s %lld\n", __func__, alarm->enabled ? 
"on" : "off", @@ -239,12 +231,9 @@ out: static irqreturn_t mc13xxx_rtc_alarm_handler(int irq, void *dev) { struct mc13xxx_rtc *priv = dev; - struct mc13xxx *mc13xxx = priv->mc13xxx; rtc_update_irq(priv->rtc, 1, RTC_IRQF | RTC_AF); - mc13xxx_irq_ack(mc13xxx, irq); - return IRQ_HANDLED; } @@ -293,8 +282,6 @@ static int __init mc13xxx_rtc_probe(struct platform_device *pdev) mc13xxx_lock(mc13xxx); - mc13xxx_irq_ack(mc13xxx, MC13XXX_IRQ_RTCRST); - ret = mc13xxx_irq_request(mc13xxx, MC13XXX_IRQ_RTCRST, mc13xxx_rtc_reset_handler, DRIVER_NAME, priv); if (ret) diff --git a/include/linux/mfd/mc13xxx.h b/include/linux/mfd/mc13xxx.h index f372926d5894..dd46fe424a80 100644 --- a/include/linux/mfd/mc13xxx.h +++ b/include/linux/mfd/mc13xxx.h @@ -31,12 +31,6 @@ int mc13xxx_adc_do_conversion(struct mc13xxx *mc13xxx, unsigned int mode, unsigned int channel, u8 ato, bool atox, unsigned int *sample); -/* Deprecated calls */ -static inline int mc13xxx_irq_ack(struct mc13xxx *mc13xxx, int irq) -{ - return 0; -} - static inline int mc13xxx_irq_request_nounmask(struct mc13xxx *mc13xxx, int irq, irq_handler_t handler, const char *name, void *dev) -- cgit v1.2.3 From b8cf8fda522d5a37f8948ad8a19a1113cc38710f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 9 Sep 2025 16:30:47 +0200 Subject: fanotify: add watchdog for permission events This is to make it easier to debug issues with AV software, which time and again deadlocks with no indication of where the issue comes from, and the kernel being blamed for the deadlock. Then we need to analyze dumps to prove that the kernel is not in fact at fault. The deadlock comes from recursion: handling the event triggers another permission event, in some roundabout way, obviously, otherwise it would have been found in testing. With this patch a warning is printed when permission event is received by userspace but not answered for more than the timeout specified in /proc/sys/fs/fanotify/watchdog_timeout. The watchdog can be turned off by setting the timeout to zero (which is the default). The timeout is very coarse (T <= t < 2T) but I guess it's good enough for the purpose. Overhead should be minimal. Signed-off-by: Miklos Szeredi Reviewed-by: Amir Goldstein Link: https://patch.msgid.link/20250909143053.112171-1-mszeredi@redhat.com Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.h | 2 + fs/notify/fanotify/fanotify_user.c | 102 +++++++++++++++++++++++++++++++++++++ include/linux/fsnotify_backend.h | 2 + 3 files changed, 106 insertions(+) (limited to 'include/linux') diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index b78308975082..39e60218df7c 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -441,7 +441,9 @@ struct fanotify_perm_event { size_t count; u32 response; /* userspace answer to the event */ unsigned short state; /* state of the event */ + unsigned short watchdog_cnt; /* already scanned by watchdog? 
*/ int fd; /* fd we passed to userspace for this event */ + pid_t recv_pid; /* pid of task receiving the event */ union { struct fanotify_response_info_header hdr; struct fanotify_response_info_audit_rule audit_rule; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 561339b4cf75..1dadda82cae5 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -50,6 +50,7 @@ /* configurable via /proc/sys/fs/fanotify/ */ static int fanotify_max_queued_events __read_mostly; +static int perm_group_timeout __read_mostly; #ifdef CONFIG_SYSCTL @@ -85,6 +86,14 @@ static const struct ctl_table fanotify_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO }, + { + .procname = "watchdog_timeout", + .data = &perm_group_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, }; static void __init fanotify_sysctls_init(void) @@ -95,6 +104,91 @@ static void __init fanotify_sysctls_init(void) #define fanotify_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ +static LIST_HEAD(perm_group_list); +static DEFINE_SPINLOCK(perm_group_lock); +static void perm_group_watchdog(struct work_struct *work); +static DECLARE_DELAYED_WORK(perm_group_work, perm_group_watchdog); + +static void perm_group_watchdog_schedule(void) +{ + schedule_delayed_work(&perm_group_work, secs_to_jiffies(perm_group_timeout)); +} + +static void perm_group_watchdog(struct work_struct *work) +{ + struct fsnotify_group *group; + struct fanotify_perm_event *event; + struct task_struct *task; + pid_t failed_pid = 0; + + guard(spinlock)(&perm_group_lock); + if (list_empty(&perm_group_list)) + return; + + list_for_each_entry(group, &perm_group_list, + fanotify_data.perm_grp_list) { + /* + * Ok to test without lock, racing with an addition is + * fine, will deal with it next round + */ + if (list_empty(&group->fanotify_data.access_list)) + continue; + + spin_lock(&group->notification_lock); + list_for_each_entry(event, &group->fanotify_data.access_list, + fae.fse.list) { + if (likely(event->watchdog_cnt == 0)) { + event->watchdog_cnt = 1; + } else if (event->watchdog_cnt == 1) { + /* Report on event only once */ + event->watchdog_cnt = 2; + + /* Do not report same pid repeatedly */ + if (event->recv_pid == failed_pid) + continue; + + failed_pid = event->recv_pid; + rcu_read_lock(); + task = find_task_by_pid_ns(event->recv_pid, + &init_pid_ns); + pr_warn_ratelimited( + "PID %u (%s) failed to respond to fanotify queue for more than %d seconds\n", + event->recv_pid, + task ? task->comm : NULL, + perm_group_timeout); + rcu_read_unlock(); + } + } + spin_unlock(&group->notification_lock); + } + perm_group_watchdog_schedule(); +} + +static void fanotify_perm_watchdog_group_remove(struct fsnotify_group *group) +{ + if (!list_empty(&group->fanotify_data.perm_grp_list)) { + /* Perm event watchdog can no longer scan this group. */ + spin_lock(&perm_group_lock); + list_del_init(&group->fanotify_data.perm_grp_list); + spin_unlock(&perm_group_lock); + } +} + +static void fanotify_perm_watchdog_group_add(struct fsnotify_group *group) +{ + if (!perm_group_timeout) + return; + + spin_lock(&perm_group_lock); + if (list_empty(&group->fanotify_data.perm_grp_list)) { + /* Add to perm_group_list for monitoring by watchdog. 
*/ + if (list_empty(&perm_group_list)) + perm_group_watchdog_schedule(); + list_add_tail(&group->fanotify_data.perm_grp_list, &perm_group_list); + } + spin_unlock(&perm_group_lock); +} + /* * All flags that may be specified in parameter event_f_flags of fanotify_init. * @@ -953,6 +1047,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, spin_lock(&group->notification_lock); list_add_tail(&event->fse.list, &group->fanotify_data.access_list); + FANOTIFY_PERM(event)->recv_pid = current->pid; spin_unlock(&group->notification_lock); } } @@ -1012,6 +1107,8 @@ static int fanotify_release(struct inode *ignored, struct file *file) */ fsnotify_group_stop_queueing(group); + fanotify_perm_watchdog_group_remove(group); + /* * Process all permission events on access_list and notification queue * and simulate reply from userspace. @@ -1465,6 +1562,10 @@ out: fsnotify_group_unlock(group); fsnotify_put_mark(fsn_mark); + + if (!ret && (mask & FANOTIFY_PERM_EVENTS)) + fanotify_perm_watchdog_group_add(group); + return ret; } @@ -1625,6 +1726,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.f_flags = event_f_flags; init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); + INIT_LIST_HEAD(&group->fanotify_data.perm_grp_list); switch (class) { case FAN_CLASS_NOTIF: group->priority = FSNOTIFY_PRIO_NORMAL; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index d4034ddaf392..0d954ea7b179 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -273,6 +273,8 @@ struct fsnotify_group { int f_flags; /* event_f_flags from fanotify_init() */ struct ucounts *ucounts; mempool_t error_events_pool; + /* chained on perm_group_list */ + struct list_head perm_grp_list; } fanotify_data; #endif /* CONFIG_FANOTIFY */ }; -- cgit v1.2.3 From 3b1bbfb5fce3ca9fffc92ac1b053b0cfbb1f322b Mon Sep 17 00:00:00 2001 From: Chris Morgan Date: Thu, 4 Sep 2025 11:05:27 -0500 Subject: mfd: bq257xx: Add support for BQ25703A core driver The Texas Instruments BQ25703A is an integrated charger manager and boost converter. The MFD driver initializes the device for the regulator driver and power supply driver. Signed-off-by: Chris Morgan Link: https://lore.kernel.org/r/20250904160530.66178-3-macroalpha82@gmail.com Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 11 +++++ drivers/mfd/Makefile | 1 + drivers/mfd/bq257xx.c | 99 +++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/bq257xx.h | 104 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 215 insertions(+) create mode 100644 drivers/mfd/bq257xx.c create mode 100644 include/linux/mfd/bq257xx.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 425c5fba6cb1..768417c97339 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -1641,6 +1641,17 @@ config MFD_TI_LMU LM36274. It consists of backlight, LED and regulator driver. It provides consistent device controls for lighting functions. +config MFD_BQ257XX + tristate "TI BQ257XX Buck/Boost Charge Controller" + depends on I2C + select MFD_CORE + select REGMAP_I2C + help + Support Texas Instruments BQ25703 Buck/Boost converter with + charge controller. It consists of regulators that provide + system voltage and OTG voltage, and a charger manager for + batteries containing one or more cells. 
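The cells share the parent's regmap through the standard MFD pattern: the
core stores struct bq257xx_device in the parent's drvdata and each child
reads it back. A hypothetical child probe (sketch only; bq257xx_charger_probe
is not part of this patch):

	static int bq257xx_charger_probe(struct platform_device *pdev)
	{
		struct bq257xx_device *ddata = dev_get_drvdata(pdev->dev.parent);
		unsigned int id;

		/* the shared regmap reaches all BQ25703 registers */
		return regmap_read(ddata->regmap, BQ25703_MANUFACT_DEV_ID, &id);
	}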
+ config MFD_OMAP_USB_HOST bool "TI OMAP USBHS core and TLL driver" depends on USB_EHCI_HCD_OMAP || USB_OHCI_HCD_OMAP3 diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f7bdedd5a66d..3d700374a42d 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_MFD_SM501) += sm501.o obj-$(CONFIG_ARCH_BCM2835) += bcm2835-pm.o obj-$(CONFIG_MFD_BCM590XX) += bcm590xx.o obj-$(CONFIG_MFD_BD9571MWV) += bd9571mwv.o +obj-$(CONFIG_MFD_BQ257XX) += bq257xx.o obj-$(CONFIG_MFD_CGBC) += cgbc-core.o obj-$(CONFIG_MFD_CROS_EC_DEV) += cros_ec_dev.o obj-$(CONFIG_MFD_CS42L43) += cs42l43.o diff --git a/drivers/mfd/bq257xx.c b/drivers/mfd/bq257xx.c new file mode 100644 index 000000000000..e9d49dac0a16 --- /dev/null +++ b/drivers/mfd/bq257xx.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BQ257XX Core Driver + * Copyright (C) 2025 Chris Morgan + */ + +#include +#include +#include +#include +#include + +static const struct regmap_range bq25703_readonly_reg_ranges[] = { + regmap_reg_range(BQ25703_CHARGER_STATUS, BQ25703_MANUFACT_DEV_ID), +}; + +static const struct regmap_access_table bq25703_writeable_regs = { + .no_ranges = bq25703_readonly_reg_ranges, + .n_no_ranges = ARRAY_SIZE(bq25703_readonly_reg_ranges), +}; + +static const struct regmap_range bq25703_volatile_reg_ranges[] = { + regmap_reg_range(BQ25703_CHARGE_OPTION_0, BQ25703_IIN_HOST), + regmap_reg_range(BQ25703_CHARGER_STATUS, BQ25703_ADC_OPTION), +}; + +static const struct regmap_access_table bq25703_volatile_regs = { + .yes_ranges = bq25703_volatile_reg_ranges, + .n_yes_ranges = ARRAY_SIZE(bq25703_volatile_reg_ranges), +}; + +static const struct regmap_config bq25703_regmap_config = { + .reg_bits = 8, + .val_bits = 16, + .max_register = BQ25703_ADC_OPTION, + .cache_type = REGCACHE_MAPLE, + .wr_table = &bq25703_writeable_regs, + .volatile_table = &bq25703_volatile_regs, + .val_format_endian = REGMAP_ENDIAN_LITTLE, +}; + +static const struct mfd_cell cells[] = { + MFD_CELL_NAME("bq257xx-regulator"), + MFD_CELL_NAME("bq257xx-charger"), +}; + +static int bq257xx_probe(struct i2c_client *client) +{ + struct bq257xx_device *ddata; + int ret; + + ddata = devm_kzalloc(&client->dev, sizeof(*ddata), GFP_KERNEL); + if (!ddata) + return -ENOMEM; + + ddata->client = client; + + ddata->regmap = devm_regmap_init_i2c(client, &bq25703_regmap_config); + if (IS_ERR(ddata->regmap)) { + return dev_err_probe(&client->dev, PTR_ERR(ddata->regmap), + "Failed to allocate register map\n"); + } + + i2c_set_clientdata(client, ddata); + + ret = devm_mfd_add_devices(&client->dev, PLATFORM_DEVID_AUTO, + cells, ARRAY_SIZE(cells), NULL, 0, NULL); + if (ret) + return dev_err_probe(&client->dev, ret, + "Failed to register child devices\n"); + + return ret; +} + +static const struct i2c_device_id bq257xx_i2c_ids[] = { + { "bq25703a" }, + {} +}; +MODULE_DEVICE_TABLE(i2c, bq257xx_i2c_ids); + +static const struct of_device_id bq257xx_of_match[] = { + { .compatible = "ti,bq25703a" }, + {} +}; +MODULE_DEVICE_TABLE(of, bq257xx_of_match); + +static struct i2c_driver bq257xx_driver = { + .driver = { + .name = "bq257xx", + .of_match_table = bq257xx_of_match, + }, + .probe = bq257xx_probe, + .id_table = bq257xx_i2c_ids, +}; +module_i2c_driver(bq257xx_driver); + +MODULE_DESCRIPTION("bq257xx buck/boost/charger driver"); +MODULE_AUTHOR("Chris Morgan "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/bq257xx.h b/include/linux/mfd/bq257xx.h new file mode 100644 index 000000000000..1d6ddc7fb09f --- /dev/null +++ b/include/linux/mfd/bq257xx.h 
@@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Register definitions for TI BQ257XX + * Copyright (C) 2020 Texas Instruments Incorporated - http://www.ti.com/ + */ + +#define BQ25703_CHARGE_OPTION_0 0x00 +#define BQ25703_CHARGE_CURRENT 0x02 +#define BQ25703_MAX_CHARGE_VOLT 0x04 +#define BQ25703_OTG_VOLT 0x06 +#define BQ25703_OTG_CURRENT 0x08 +#define BQ25703_INPUT_VOLTAGE 0x0a +#define BQ25703_MIN_VSYS 0x0c +#define BQ25703_IIN_HOST 0x0e +#define BQ25703_CHARGER_STATUS 0x20 +#define BQ25703_PROCHOT_STATUS 0x22 +#define BQ25703_IIN_DPM 0x24 +#define BQ25703_ADCIBAT_CHG 0x28 +#define BQ25703_ADCIINCMPIN 0x2a +#define BQ25703_ADCVSYSVBAT 0x2c +#define BQ25703_MANUFACT_DEV_ID 0x2e +#define BQ25703_CHARGE_OPTION_1 0x30 +#define BQ25703_CHARGE_OPTION_2 0x32 +#define BQ25703_CHARGE_OPTION_3 0x34 +#define BQ25703_ADC_OPTION 0x3a + +#define BQ25703_EN_LWPWR BIT(15) +#define BQ25703_WDTMR_ADJ_MASK GENMASK(14, 13) +#define BQ25703_WDTMR_DISABLE 0 +#define BQ25703_WDTMR_5_SEC 1 +#define BQ25703_WDTMR_88_SEC 2 +#define BQ25703_WDTMR_175_SEC 3 + +#define BQ25703_ICHG_MASK GENMASK(12, 6) +#define BQ25703_ICHG_STEP_UA 64000 +#define BQ25703_ICHG_MIN_UA 64000 +#define BQ25703_ICHG_MAX_UA 8128000 + +#define BQ25703_MAX_CHARGE_VOLT_MASK GENMASK(15, 4) +#define BQ25703_VBATREG_STEP_UV 16000 +#define BQ25703_VBATREG_MIN_UV 1024000 +#define BQ25703_VBATREG_MAX_UV 19200000 + +#define BQ25703_OTG_VOLT_MASK GENMASK(13, 6) +#define BQ25703_OTG_VOLT_STEP_UV 64000 +#define BQ25703_OTG_VOLT_MIN_UV 4480000 +#define BQ25703_OTG_VOLT_MAX_UV 20800000 +#define BQ25703_OTG_VOLT_NUM_VOLT 256 + +#define BQ25703_OTG_CUR_MASK GENMASK(14, 8) +#define BQ25703_OTG_CUR_STEP_UA 50000 +#define BQ25703_OTG_CUR_MAX_UA 6350000 + +#define BQ25703_MINVSYS_MASK GENMASK(13, 8) +#define BQ25703_MINVSYS_STEP_UV 256000 +#define BQ25703_MINVSYS_MIN_UV 1024000 +#define BQ25703_MINVSYS_MAX_UV 16128000 + +#define BQ25703_STS_AC_STAT BIT(15) +#define BQ25703_STS_IN_FCHRG BIT(10) +#define BQ25703_STS_IN_PCHRG BIT(9) +#define BQ25703_STS_FAULT_ACOV BIT(7) +#define BQ25703_STS_FAULT_BATOC BIT(6) +#define BQ25703_STS_FAULT_ACOC BIT(5) + +#define BQ25703_IINDPM_MASK GENMASK(14, 8) +#define BQ25703_IINDPM_STEP_UA 50000 +#define BQ25703_IINDPM_MIN_UA 50000 +#define BQ25703_IINDPM_MAX_UA 6400000 +#define BQ25703_IINDPM_DEFAULT_UA 3300000 +#define BQ25703_IINDPM_OFFSET_UA 50000 + +#define BQ25703_ADCIBAT_DISCHG_MASK GENMASK(6, 0) +#define BQ25703_ADCIBAT_CHG_MASK GENMASK(14, 8) +#define BQ25703_ADCIBAT_CHG_STEP_UA 64000 +#define BQ25703_ADCIBAT_DIS_STEP_UA 256000 + +#define BQ25703_ADCIIN GENMASK(15, 8) +#define BQ25703_ADCIINCMPIN_STEP 50000 + +#define BQ25703_ADCVSYS_MASK GENMASK(15, 8) +#define BQ25703_ADCVBAT_MASK GENMASK(7, 0) +#define BQ25703_ADCVSYSVBAT_OFFSET_UV 2880000 +#define BQ25703_ADCVSYSVBAT_STEP 64000 + +#define BQ25703_ADC_CH_MASK GENMASK(7, 0) +#define BQ25703_ADC_CONV_EN BIT(15) +#define BQ25703_ADC_START BIT(14) +#define BQ25703_ADC_FULL_SCALE BIT(13) +#define BQ25703_ADC_CMPIN_EN BIT(7) +#define BQ25703_ADC_VBUS_EN BIT(6) +#define BQ25703_ADC_PSYS_EN BIT(5) +#define BQ25703_ADC_IIN_EN BIT(4) +#define BQ25703_ADC_IDCHG_EN BIT(3) +#define BQ25703_ADC_ICHG_EN BIT(2) +#define BQ25703_ADC_VSYS_EN BIT(1) +#define BQ25703_ADC_VBAT_EN BIT(0) + +#define BQ25703_EN_OTG_MASK BIT(12) + +struct bq257xx_device { + struct i2c_client *client; + struct regmap *regmap; +}; -- cgit v1.2.3 From 948cb194bcb4c01fb4cd029936f0c02b10780394 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 30 Jul 2025 15:59:24 +0200 Subject: mtd: map: add 
back asm/barrier.h inclusion

The mb() macro is used in this header:

In file included from include/linux/mtd/qinfo.h:5,
                 from include/linux/mtd/pfow.h:8,
                 from drivers/mtd/lpddr/lpddr_cmds.c:14:
include/linux/mtd/map.h: In function 'inline_map_write':
include/linux/mtd/map.h:428:9: error: implicit declaration of function 'mb' [-Wimplicit-function-declaration]

Fixes: 56eb7c13b97c ("mtd: map: Don't use "proxy" headers")
Signed-off-by: Arnd Bergmann
Acked-by: Andy Shevchenko
Signed-off-by: Miquel Raynal
---
 include/linux/mtd/map.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h
index 288ef765a44e..75b0b2abc880 100644
--- a/include/linux/mtd/map.h
+++ b/include/linux/mtd/map.h
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include

 struct device_node;
 struct module;
--
cgit v1.2.3


From 67a529b7d3c50a56c162476509361f4fe11350dd Mon Sep 17 00:00:00 2001
From: David Lechner
Date: Fri, 15 Aug 2025 12:40:02 -0500
Subject: include: adi-axi-common: add version check function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a version check function for checking ADI AXI IP core versions.
These cores use a semantic versioning scheme, so it is useful to have a
version check function that can check the minor version to enable
features in a driver while maintaining backward compatibility.

Signed-off-by: David Lechner
Reviewed-by: Nuno Sá
Link: https://patch.msgid.link/20250815-spi-axi-spi-enigne-improve-version-checks-v1-1-13bde357d5b6@baylibre.com
Signed-off-by: Mark Brown
---
 include/linux/adi-axi-common.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/adi-axi-common.h b/include/linux/adi-axi-common.h
index f64f4ad4beda..37962ba530df 100644
--- a/include/linux/adi-axi-common.h
+++ b/include/linux/adi-axi-common.h
@@ -8,6 +8,8 @@
 * https://wiki.analog.com/resources/fpga/docs/hdl/regmap
 */

+#include
+
 #ifndef ADI_AXI_COMMON_H_
 #define ADI_AXI_COMMON_H_

@@ -21,6 +23,25 @@
 #define ADI_AXI_PCORE_VER_MINOR(version)	(((version) >> 8) & 0xff)
 #define ADI_AXI_PCORE_VER_PATCH(version)	((version) & 0xff)

+/**
+ * adi_axi_pcore_ver_gteq() - check if a version requirement is satisfied
+ * @version: the full version read from the hardware
+ * @major: the major version to compare against
+ * @minor: the minor version to compare against
+ *
+ * ADI AXI IP Cores use semantic versioning, so this can be used to check for
+ * feature availability.
+ *
+ * Return: true if the version is greater than or equal to the specified
+ * major and minor version, false otherwise.
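+ *
+ * For example, with a core that reports version 1.3.5,
+ * adi_axi_pcore_ver_gteq(version, 1, 2) is true, while
+ * adi_axi_pcore_ver_gteq(version, 2, 0) is false.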
+ */ +static inline bool adi_axi_pcore_ver_gteq(u32 version, u32 major, u32 minor) +{ + return ADI_AXI_PCORE_VER_MAJOR(version) > (major) || + (ADI_AXI_PCORE_VER_MAJOR(version) == (major) && + ADI_AXI_PCORE_VER_MINOR(version) >= (minor)); +} + #define ADI_AXI_INFO_FPGA_TECH(info) (((info) >> 24) & 0xff) #define ADI_AXI_INFO_FPGA_FAMILY(info) (((info) >> 16) & 0xff) #define ADI_AXI_INFO_FPGA_SPEED_GRADE(info) (((info) >> 8) & 0xff) -- cgit v1.2.3 From 413cf5db2fee00fdd69bc62debdbf655f97f4c08 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Tue, 12 Aug 2025 06:23:31 +0200 Subject: libie, ice: move fwlog admin queue to libie Copy the code and: - change ICE_AQC to LIBIE_AQC - change ice_aqc to libie_aqc - move definitions outside the structures Reviewed-by: Przemek Kitszel Signed-off-by: Michal Swiatkowski Tested-by: Rinitha S (A Contingent worker at Intel) Reviewed-by: Simon Horman Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_adminq_cmd.h | 78 ---------------------- drivers/net/ethernet/intel/ice/ice_debugfs.c | 21 +++--- drivers/net/ethernet/intel/ice/ice_fwlog.c | 46 ++++++------- drivers/net/ethernet/intel/ice/ice_fwlog.h | 2 +- include/linux/net/intel/libie/adminq.h | 89 +++++++++++++++++++++++++ 5 files changed, 124 insertions(+), 112 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index caae1780fd37..93aedb35fd17 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -2399,42 +2399,6 @@ struct ice_aqc_event_lan_overflow { u8 reserved[8]; }; -enum ice_aqc_fw_logging_mod { - ICE_AQC_FW_LOG_ID_GENERAL = 0, - ICE_AQC_FW_LOG_ID_CTRL, - ICE_AQC_FW_LOG_ID_LINK, - ICE_AQC_FW_LOG_ID_LINK_TOPO, - ICE_AQC_FW_LOG_ID_DNL, - ICE_AQC_FW_LOG_ID_I2C, - ICE_AQC_FW_LOG_ID_SDP, - ICE_AQC_FW_LOG_ID_MDIO, - ICE_AQC_FW_LOG_ID_ADMINQ, - ICE_AQC_FW_LOG_ID_HDMA, - ICE_AQC_FW_LOG_ID_LLDP, - ICE_AQC_FW_LOG_ID_DCBX, - ICE_AQC_FW_LOG_ID_DCB, - ICE_AQC_FW_LOG_ID_XLR, - ICE_AQC_FW_LOG_ID_NVM, - ICE_AQC_FW_LOG_ID_AUTH, - ICE_AQC_FW_LOG_ID_VPD, - ICE_AQC_FW_LOG_ID_IOSF, - ICE_AQC_FW_LOG_ID_PARSER, - ICE_AQC_FW_LOG_ID_SW, - ICE_AQC_FW_LOG_ID_SCHEDULER, - ICE_AQC_FW_LOG_ID_TXQ, - ICE_AQC_FW_LOG_ID_RSVD, - ICE_AQC_FW_LOG_ID_POST, - ICE_AQC_FW_LOG_ID_WATCHDOG, - ICE_AQC_FW_LOG_ID_TASK_DISPATCH, - ICE_AQC_FW_LOG_ID_MNG, - ICE_AQC_FW_LOG_ID_SYNCE, - ICE_AQC_FW_LOG_ID_HEALTH, - ICE_AQC_FW_LOG_ID_TSDRV, - ICE_AQC_FW_LOG_ID_PFREG, - ICE_AQC_FW_LOG_ID_MDLVER, - ICE_AQC_FW_LOG_ID_MAX, -}; - enum ice_aqc_health_status_mask { ICE_AQC_HEALTH_STATUS_SET_PF_SPECIFIC_MASK = BIT(0), ICE_AQC_HEALTH_STATUS_SET_ALL_PF_MASK = BIT(1), @@ -2516,48 +2480,6 @@ struct ice_aqc_health_status_elem { __le32 internal_data2; }; -/* Set FW Logging configuration (indirect 0xFF30) - * Register for FW Logging (indirect 0xFF31) - * Query FW Logging (indirect 0xFF32) - * FW Log Event (indirect 0xFF33) - */ -struct ice_aqc_fw_log { - u8 cmd_flags; -#define ICE_AQC_FW_LOG_CONF_UART_EN BIT(0) -#define ICE_AQC_FW_LOG_CONF_AQ_EN BIT(1) -#define ICE_AQC_FW_LOG_QUERY_REGISTERED BIT(2) -#define ICE_AQC_FW_LOG_CONF_SET_VALID BIT(3) -#define ICE_AQC_FW_LOG_AQ_REGISTER BIT(0) -#define ICE_AQC_FW_LOG_AQ_QUERY BIT(2) - - u8 rsp_flag; - __le16 fw_rt_msb; - union { - struct { - __le32 fw_rt_lsb; - } sync; - struct { - __le16 log_resolution; -#define ICE_AQC_FW_LOG_MIN_RESOLUTION (1) -#define ICE_AQC_FW_LOG_MAX_RESOLUTION (128) - - __le16 mdl_cnt; - } cfg; - } ops; - __le32 
addr_high; - __le32 addr_low; -}; - -/* Response Buffer for: - * Set Firmware Logging Configuration (0xFF30) - * Query FW Logging (0xFF32) - */ -struct ice_aqc_fw_log_cfg_resp { - __le16 module_identifier; - u8 log_level; - u8 rsvd0; -}; - /* Admin Queue command opcodes */ enum ice_adminq_opc { /* AQ commands */ diff --git a/drivers/net/ethernet/intel/ice/ice_debugfs.c b/drivers/net/ethernet/intel/ice/ice_debugfs.c index 161e0571bae7..6f7f6788447e 100644 --- a/drivers/net/ethernet/intel/ice/ice_debugfs.c +++ b/drivers/net/ethernet/intel/ice/ice_debugfs.c @@ -12,10 +12,11 @@ static struct dentry *ice_debugfs_root; /* create a define that has an extra module that doesn't really exist. this * is so we can add a module 'all' to easily enable/disable all the modules */ -#define ICE_NR_FW_LOG_MODULES (ICE_AQC_FW_LOG_ID_MAX + 1) +#define ICE_NR_FW_LOG_MODULES (LIBIE_AQC_FW_LOG_ID_MAX + 1) /* the ordering in this array is important. it matches the ordering of the - * values in the FW so the index is the same value as in ice_aqc_fw_logging_mod + * values in the FW so the index is the same value as in + * libie_aqc_fw_logging_mod */ static const char * const ice_fwlog_module_string[] = { "general", @@ -84,7 +85,7 @@ ice_fwlog_print_module_cfg(struct ice_fwlog_cfg *cfg, int module, { struct ice_fwlog_module_entry *entry; - if (module != ICE_AQC_FW_LOG_ID_MAX) { + if (module != LIBIE_AQC_FW_LOG_ID_MAX) { entry = &cfg->module_entries[module]; seq_printf(s, "\tModule: %s, Log Level: %s\n", @@ -93,7 +94,7 @@ ice_fwlog_print_module_cfg(struct ice_fwlog_cfg *cfg, int module, } else { int i; - for (i = 0; i < ICE_AQC_FW_LOG_ID_MAX; i++) { + for (i = 0; i < LIBIE_AQC_FW_LOG_ID_MAX; i++) { entry = &cfg->module_entries[i]; seq_printf(s, "\tModule: %s, Log Level: %s\n", @@ -190,7 +191,7 @@ ice_debugfs_module_write(struct file *filp, const char __user *buf, return -EINVAL; } - if (module != ICE_AQC_FW_LOG_ID_MAX) { + if (module != LIBIE_AQC_FW_LOG_ID_MAX) { fwlog->cfg.module_entries[module].log_level = log_level; } else { /* the module 'all' is a shortcut so that we can set @@ -198,7 +199,7 @@ ice_debugfs_module_write(struct file *filp, const char __user *buf, */ int i; - for (i = 0; i < ICE_AQC_FW_LOG_ID_MAX; i++) + for (i = 0; i < LIBIE_AQC_FW_LOG_ID_MAX; i++) fwlog->cfg.module_entries[i].log_level = log_level; } @@ -266,11 +267,11 @@ ice_debugfs_nr_messages_write(struct file *filp, const char __user *buf, if (ret) return ret; - if (nr_messages < ICE_AQC_FW_LOG_MIN_RESOLUTION || - nr_messages > ICE_AQC_FW_LOG_MAX_RESOLUTION) { + if (nr_messages < LIBIE_AQC_FW_LOG_MIN_RESOLUTION || + nr_messages > LIBIE_AQC_FW_LOG_MAX_RESOLUTION) { dev_err(dev, "Invalid FW log number of messages %d, value must be between %d - %d\n", - nr_messages, ICE_AQC_FW_LOG_MIN_RESOLUTION, - ICE_AQC_FW_LOG_MAX_RESOLUTION); + nr_messages, LIBIE_AQC_FW_LOG_MIN_RESOLUTION, + LIBIE_AQC_FW_LOG_MAX_RESOLUTION); return -EINVAL; } diff --git a/drivers/net/ethernet/intel/ice/ice_fwlog.c b/drivers/net/ethernet/intel/ice/ice_fwlog.c index 9e640e942feb..30175be484a5 100644 --- a/drivers/net/ethernet/intel/ice/ice_fwlog.c +++ b/drivers/net/ethernet/intel/ice/ice_fwlog.c @@ -142,8 +142,8 @@ static bool ice_fwlog_supported(struct ice_fwlog *fwlog) */ static int ice_aq_fwlog_get(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) { - struct ice_aqc_fw_log_cfg_resp *fw_modules; - struct ice_aqc_fw_log *cmd; + struct libie_aqc_fw_log_cfg_resp *fw_modules; + struct libie_aqc_fw_log *cmd; struct libie_aq_desc desc; u16 module_id_cnt; int status; @@ -156,10 
+156,10 @@ static int ice_aq_fwlog_get(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) if (!buf) return -ENOMEM; - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logs_query); + ice_fill_dflt_direct_cmd_desc(&desc, libie_aqc_opc_fw_logs_query); cmd = libie_aq_raw(&desc); - cmd->cmd_flags = ICE_AQC_FW_LOG_AQ_QUERY; + cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_QUERY; status = fwlog->send_cmd(fwlog->priv, &desc, buf, ICE_AQ_MAX_BUF_LEN); if (status) { @@ -168,26 +168,26 @@ static int ice_aq_fwlog_get(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) } module_id_cnt = le16_to_cpu(cmd->ops.cfg.mdl_cnt); - if (module_id_cnt < ICE_AQC_FW_LOG_ID_MAX) { + if (module_id_cnt < LIBIE_AQC_FW_LOG_ID_MAX) { dev_dbg(&fwlog->pdev->dev, "FW returned less than the expected number of FW log module IDs\n"); - } else if (module_id_cnt > ICE_AQC_FW_LOG_ID_MAX) { + } else if (module_id_cnt > LIBIE_AQC_FW_LOG_ID_MAX) { dev_dbg(&fwlog->pdev->dev, "FW returned more than expected number of FW log module IDs, setting module_id_cnt to software expected max %u\n", - ICE_AQC_FW_LOG_ID_MAX); - module_id_cnt = ICE_AQC_FW_LOG_ID_MAX; + LIBIE_AQC_FW_LOG_ID_MAX); + module_id_cnt = LIBIE_AQC_FW_LOG_ID_MAX; } cfg->log_resolution = le16_to_cpu(cmd->ops.cfg.log_resolution); - if (cmd->cmd_flags & ICE_AQC_FW_LOG_CONF_AQ_EN) + if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_CONF_AQ_EN) cfg->options |= ICE_FWLOG_OPTION_ARQ_ENA; - if (cmd->cmd_flags & ICE_AQC_FW_LOG_CONF_UART_EN) + if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_CONF_UART_EN) cfg->options |= ICE_FWLOG_OPTION_UART_ENA; - if (cmd->cmd_flags & ICE_AQC_FW_LOG_QUERY_REGISTERED) + if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_QUERY_REGISTERED) cfg->options |= ICE_FWLOG_OPTION_IS_REGISTERED; - fw_modules = (struct ice_aqc_fw_log_cfg_resp *)buf; + fw_modules = (struct libie_aqc_fw_log_cfg_resp *)buf; for (i = 0; i < module_id_cnt; i++) { - struct ice_aqc_fw_log_cfg_resp *fw_module = &fw_modules[i]; + struct libie_aqc_fw_log_cfg_resp *fw_module = &fw_modules[i]; cfg->module_entries[i].module_id = le16_to_cpu(fw_module->module_identifier); @@ -325,8 +325,8 @@ ice_aq_fwlog_set(struct ice_fwlog *fwlog, struct ice_fwlog_module_entry *entries, u16 num_entries, u16 options, u16 log_resolution) { - struct ice_aqc_fw_log_cfg_resp *fw_modules; - struct ice_aqc_fw_log *cmd; + struct libie_aqc_fw_log_cfg_resp *fw_modules; + struct libie_aqc_fw_log *cmd; struct libie_aq_desc desc; int status; int i; @@ -341,19 +341,19 @@ ice_aq_fwlog_set(struct ice_fwlog *fwlog, fw_modules[i].log_level = entries[i].log_level; } - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logs_config); + ice_fill_dflt_direct_cmd_desc(&desc, libie_aqc_opc_fw_logs_config); desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD); cmd = libie_aq_raw(&desc); - cmd->cmd_flags = ICE_AQC_FW_LOG_CONF_SET_VALID; + cmd->cmd_flags = LIBIE_AQC_FW_LOG_CONF_SET_VALID; cmd->ops.cfg.log_resolution = cpu_to_le16(log_resolution); cmd->ops.cfg.mdl_cnt = cpu_to_le16(num_entries); if (options & ICE_FWLOG_OPTION_ARQ_ENA) - cmd->cmd_flags |= ICE_AQC_FW_LOG_CONF_AQ_EN; + cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_AQ_EN; if (options & ICE_FWLOG_OPTION_UART_ENA) - cmd->cmd_flags |= ICE_AQC_FW_LOG_CONF_UART_EN; + cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_UART_EN; status = fwlog->send_cmd(fwlog->priv, &desc, fw_modules, sizeof(*fw_modules) * num_entries); @@ -382,7 +382,7 @@ int ice_fwlog_set(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) return -EOPNOTSUPP; return ice_aq_fwlog_set(fwlog, cfg->module_entries, - ICE_AQC_FW_LOG_ID_MAX, cfg->options, + LIBIE_AQC_FW_LOG_ID_MAX, 
cfg->options, cfg->log_resolution); } @@ -393,14 +393,14 @@ int ice_fwlog_set(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) */ static int ice_aq_fwlog_register(struct ice_fwlog *fwlog, bool reg) { - struct ice_aqc_fw_log *cmd; + struct libie_aqc_fw_log *cmd; struct libie_aq_desc desc; - ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_fw_logs_register); + ice_fill_dflt_direct_cmd_desc(&desc, libie_aqc_opc_fw_logs_register); cmd = libie_aq_raw(&desc); if (reg) - cmd->cmd_flags = ICE_AQC_FW_LOG_AQ_REGISTER; + cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_REGISTER; return fwlog->send_cmd(fwlog->priv, &desc, NULL, 0); } diff --git a/drivers/net/ethernet/intel/ice/ice_fwlog.h b/drivers/net/ethernet/intel/ice/ice_fwlog.h index 22585ea9ec93..9efa4a83c957 100644 --- a/drivers/net/ethernet/intel/ice/ice_fwlog.h +++ b/drivers/net/ethernet/intel/ice/ice_fwlog.h @@ -29,7 +29,7 @@ struct ice_fwlog_module_entry { struct ice_fwlog_cfg { /* list of modules for configuring log level */ - struct ice_fwlog_module_entry module_entries[ICE_AQC_FW_LOG_ID_MAX]; + struct ice_fwlog_module_entry module_entries[LIBIE_AQC_FW_LOG_ID_MAX]; /* options used to configure firmware logging */ u16 options; #define ICE_FWLOG_OPTION_ARQ_ENA BIT(0) diff --git a/include/linux/net/intel/libie/adminq.h b/include/linux/net/intel/libie/adminq.h index ba62f703df43..ca2ac88b5709 100644 --- a/include/linux/net/intel/libie/adminq.h +++ b/include/linux/net/intel/libie/adminq.h @@ -222,6 +222,94 @@ struct libie_aqc_list_caps_elem { }; LIBIE_CHECK_STRUCT_LEN(32, libie_aqc_list_caps_elem); +/* Admin Queue command opcodes */ +enum libie_adminq_opc { + /* FW Logging Commands */ + libie_aqc_opc_fw_logs_config = 0xFF30, + libie_aqc_opc_fw_logs_register = 0xFF31, + libie_aqc_opc_fw_logs_query = 0xFF32, + libie_aqc_opc_fw_logs_event = 0xFF33, +}; + +enum libie_aqc_fw_logging_mod { + LIBIE_AQC_FW_LOG_ID_GENERAL = 0, + LIBIE_AQC_FW_LOG_ID_CTRL, + LIBIE_AQC_FW_LOG_ID_LINK, + LIBIE_AQC_FW_LOG_ID_LINK_TOPO, + LIBIE_AQC_FW_LOG_ID_DNL, + LIBIE_AQC_FW_LOG_ID_I2C, + LIBIE_AQC_FW_LOG_ID_SDP, + LIBIE_AQC_FW_LOG_ID_MDIO, + LIBIE_AQC_FW_LOG_ID_ADMINQ, + LIBIE_AQC_FW_LOG_ID_HDMA, + LIBIE_AQC_FW_LOG_ID_LLDP, + LIBIE_AQC_FW_LOG_ID_DCBX, + LIBIE_AQC_FW_LOG_ID_DCB, + LIBIE_AQC_FW_LOG_ID_XLR, + LIBIE_AQC_FW_LOG_ID_NVM, + LIBIE_AQC_FW_LOG_ID_AUTH, + LIBIE_AQC_FW_LOG_ID_VPD, + LIBIE_AQC_FW_LOG_ID_IOSF, + LIBIE_AQC_FW_LOG_ID_PARSER, + LIBIE_AQC_FW_LOG_ID_SW, + LIBIE_AQC_FW_LOG_ID_SCHEDULER, + LIBIE_AQC_FW_LOG_ID_TXQ, + LIBIE_AQC_FW_LOG_ID_RSVD, + LIBIE_AQC_FW_LOG_ID_POST, + LIBIE_AQC_FW_LOG_ID_WATCHDOG, + LIBIE_AQC_FW_LOG_ID_TASK_DISPATCH, + LIBIE_AQC_FW_LOG_ID_MNG, + LIBIE_AQC_FW_LOG_ID_SYNCE, + LIBIE_AQC_FW_LOG_ID_HEALTH, + LIBIE_AQC_FW_LOG_ID_TSDRV, + LIBIE_AQC_FW_LOG_ID_PFREG, + LIBIE_AQC_FW_LOG_ID_MDLVER, + LIBIE_AQC_FW_LOG_ID_MAX, +}; + +/* Set FW Logging configuration (indirect 0xFF30) + * Register for FW Logging (indirect 0xFF31) + * Query FW Logging (indirect 0xFF32) + * FW Log Event (indirect 0xFF33) + */ +#define LIBIE_AQC_FW_LOG_CONF_UART_EN BIT(0) +#define LIBIE_AQC_FW_LOG_CONF_AQ_EN BIT(1) +#define LIBIE_AQC_FW_LOG_QUERY_REGISTERED BIT(2) +#define LIBIE_AQC_FW_LOG_CONF_SET_VALID BIT(3) +#define LIBIE_AQC_FW_LOG_AQ_REGISTER BIT(0) +#define LIBIE_AQC_FW_LOG_AQ_QUERY BIT(2) + +#define LIBIE_AQC_FW_LOG_MIN_RESOLUTION (1) +#define LIBIE_AQC_FW_LOG_MAX_RESOLUTION (128) + +struct libie_aqc_fw_log { + u8 cmd_flags; + + u8 rsp_flag; + __le16 fw_rt_msb; + union { + struct { + __le32 fw_rt_lsb; + } sync; + struct { + __le16 log_resolution; + __le16 mdl_cnt; + } cfg; + 
} ops; + __le32 addr_high; + __le32 addr_low; +}; + +/* Response Buffer for: + * Set Firmware Logging Configuration (0xFF30) + * Query FW Logging (0xFF32) + */ +struct libie_aqc_fw_log_cfg_resp { + __le16 module_identifier; + u8 log_level; + u8 rsvd0; +}; + /** * struct libie_aq_desc - Admin Queue (AQ) descriptor * @flags: LIBIE_AQ_FLAG_* flags @@ -253,6 +341,7 @@ struct libie_aq_desc { struct libie_aqc_driver_ver driver_ver; struct libie_aqc_req_res res_owner; struct libie_aqc_list_caps get_cap; + struct libie_aqc_fw_log fw_log; } params; }; LIBIE_CHECK_STRUCT_LEN(32, libie_aq_desc); -- cgit v1.2.3 From 02f44dac8930dc7cc43aa3eba872ce35382f6332 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Tue, 12 Aug 2025 06:23:33 +0200 Subject: ice: prepare for moving file to libie s/ice/libie There is no function for filling a default descriptor in libie. Zero the descriptor structure and set the opcode without calling the function. Make functions that are called only in ice_fwlog.c static. Reviewed-by: Przemek Kitszel Signed-off-by: Michal Swiatkowski Tested-by: Rinitha S (A Contingent worker at Intel) Reviewed-by: Simon Horman Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_common.c | 6 +- drivers/net/ethernet/intel/ice/ice_fwlog.c | 624 ++++++++++++++-------------- drivers/net/ethernet/intel/ice/ice_fwlog.h | 78 ++-- drivers/net/ethernet/intel/ice/ice_main.c | 4 +- drivers/net/ethernet/intel/ice/ice_type.h | 2 +- include/linux/net/intel/libie/adminq.h | 1 + 6 files changed, 359 insertions(+), 356 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index fd15d7385aaa..1baac0b74caf 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -995,7 +995,7 @@ static int __fwlog_send_cmd(void *priv, struct libie_aq_desc *desc, void *buf, static int __fwlog_init(struct ice_hw *hw) { struct ice_pf *pf = hw->back; - struct ice_fwlog_api api = { + struct libie_fwlog_api api = { .pdev = pf->pdev, .send_cmd = __fwlog_send_cmd, .priv = hw, @@ -1012,7 +1012,7 @@ static int __fwlog_init(struct ice_hw *hw) api.debugfs_root = pf->ice_debugfs_pf; - return ice_fwlog_init(&hw->fwlog, &api); + return libie_fwlog_init(&hw->fwlog, &api); } /** @@ -1197,7 +1197,7 @@ static void __fwlog_deinit(struct ice_hw *hw) return; ice_debugfs_pf_deinit(hw->back); - ice_fwlog_deinit(&hw->fwlog); + libie_fwlog_deinit(&hw->fwlog); } /** diff --git a/drivers/net/ethernet/intel/ice/ice_fwlog.c b/drivers/net/ethernet/intel/ice/ice_fwlog.c index 036c595cf92b..cb2d3ff203c4 100644 --- a/drivers/net/ethernet/intel/ice/ice_fwlog.c +++ b/drivers/net/ethernet/intel/ice/ice_fwlog.c @@ -12,13 +12,13 @@ /* create a define that has an extra module that doesn't really exist. this * is so we can add a module 'all' to easily enable/disable all the modules */ -#define ICE_NR_FW_LOG_MODULES (LIBIE_AQC_FW_LOG_ID_MAX + 1) +#define LIBIE_NR_FW_LOG_MODULES (LIBIE_AQC_FW_LOG_ID_MAX + 1) /* the ordering in this array is important. it matches the ordering of the * values in the FW so the index is the same value as in * libie_aqc_fw_logging_mod */ -static const char * const ice_fwlog_module_string[] = { +static const char * const libie_fwlog_module_string[] = { "general", "ctrl", "link", @@ -55,9 +55,9 @@ static const char * const ice_fwlog_module_string[] = { }; /* the ordering in this array is important.
it matches the ordering of the - * values in the FW so the index is the same value as in ice_fwlog_level + * values in the FW so the index is the same value as in libie_fwlog_level */ -static const char * const ice_fwlog_level_string[] = { +static const char * const libie_fwlog_level_string[] = { "none", "error", "warning", @@ -65,7 +65,7 @@ static const char * const ice_fwlog_level_string[] = { "verbose", }; -static const char * const ice_fwlog_log_size[] = { +static const char * const libie_fwlog_log_size[] = { "128K", "256K", "512K", @@ -73,43 +73,43 @@ static const char * const ice_fwlog_log_size[] = { "2M", }; -static bool ice_fwlog_ring_empty(struct ice_fwlog_ring *rings) +static bool libie_fwlog_ring_empty(struct libie_fwlog_ring *rings) { return rings->head == rings->tail; } -static void ice_fwlog_ring_increment(u16 *item, u16 size) +static void libie_fwlog_ring_increment(u16 *item, u16 size) { *item = (*item + 1) & (size - 1); } -static int ice_fwlog_alloc_ring_buffs(struct ice_fwlog_ring *rings) +static int libie_fwlog_alloc_ring_buffs(struct libie_fwlog_ring *rings) { int i, nr_bytes; u8 *mem; - nr_bytes = rings->size * ICE_AQ_MAX_BUF_LEN; + nr_bytes = rings->size * LIBIE_AQ_MAX_BUF_LEN; mem = vzalloc(nr_bytes); if (!mem) return -ENOMEM; for (i = 0; i < rings->size; i++) { - struct ice_fwlog_data *ring = &rings->rings[i]; + struct libie_fwlog_data *ring = &rings->rings[i]; - ring->data_size = ICE_AQ_MAX_BUF_LEN; + ring->data_size = LIBIE_AQ_MAX_BUF_LEN; ring->data = mem; - mem += ICE_AQ_MAX_BUF_LEN; + mem += LIBIE_AQ_MAX_BUF_LEN; } return 0; } -static void ice_fwlog_free_ring_buffs(struct ice_fwlog_ring *rings) +static void libie_fwlog_free_ring_buffs(struct libie_fwlog_ring *rings) { int i; for (i = 0; i < rings->size; i++) { - struct ice_fwlog_data *ring = &rings->rings[i]; + struct libie_fwlog_data *ring = &rings->rings[i]; /* the first ring is the base memory for the whole range so * free it @@ -122,16 +122,16 @@ static void ice_fwlog_free_ring_buffs(struct ice_fwlog_ring *rings) } } -#define ICE_FWLOG_INDEX_TO_BYTES(n) ((128 * 1024) << (n)) +#define LIBIE_FWLOG_INDEX_TO_BYTES(n) ((128 * 1024) << (n)) /** - * ice_fwlog_realloc_rings - reallocate the FW log rings + * libie_fwlog_realloc_rings - reallocate the FW log rings * @fwlog: pointer to the fwlog structure * @index: the new index to use to allocate memory for the log data * */ -static void ice_fwlog_realloc_rings(struct ice_fwlog *fwlog, int index) +static void libie_fwlog_realloc_rings(struct libie_fwlog *fwlog, int index) { - struct ice_fwlog_ring ring; + struct libie_fwlog_ring ring; int status, ring_size; /* convert the number of bytes into a number of 4K buffers. 
externally @@ -143,7 +143,7 @@ static void ice_fwlog_realloc_rings(struct ice_fwlog *fwlog, int index) * the user the driver knows that the data is correct and the FW log * can be correctly parsed by the tools */ - ring_size = ICE_FWLOG_INDEX_TO_BYTES(index) / ICE_AQ_MAX_BUF_LEN; + ring_size = LIBIE_FWLOG_INDEX_TO_BYTES(index) / LIBIE_AQ_MAX_BUF_LEN; if (ring_size == fwlog->ring.size) return; @@ -157,15 +157,15 @@ static void ice_fwlog_realloc_rings(struct ice_fwlog *fwlog, int index) ring.size = ring_size; - status = ice_fwlog_alloc_ring_buffs(&ring); + status = libie_fwlog_alloc_ring_buffs(&ring); if (status) { dev_warn(&fwlog->pdev->dev, "Unable to allocate memory for FW log ring data buffers\n"); - ice_fwlog_free_ring_buffs(&ring); + libie_fwlog_free_ring_buffs(&ring); kfree(ring.rings); return; } - ice_fwlog_free_ring_buffs(&fwlog->ring); + libie_fwlog_free_ring_buffs(&fwlog->ring); kfree(fwlog->ring.rings); fwlog->ring.rings = ring.rings; @@ -176,23 +176,174 @@ static void ice_fwlog_realloc_rings(struct ice_fwlog *fwlog, int index) } /** - * ice_fwlog_print_module_cfg - print current FW logging module configuration + * libie_fwlog_supported - Cached for whether FW supports FW logging or not + * @fwlog: pointer to the fwlog structure + * + * This will always return false if called before libie_init_hw(), so it must be + * called after libie_init_hw(). + */ +static bool libie_fwlog_supported(struct libie_fwlog *fwlog) +{ + return fwlog->supported; +} + +/** + * libie_aq_fwlog_set - Set FW logging configuration AQ command (0xFF30) + * @fwlog: pointer to the fwlog structure + * @entries: entries to configure + * @num_entries: number of @entries + * @options: options from libie_fwlog_cfg->options structure + * @log_resolution: logging resolution + */ +static int +libie_aq_fwlog_set(struct libie_fwlog *fwlog, + struct libie_fwlog_module_entry *entries, u16 num_entries, + u16 options, u16 log_resolution) +{ + struct libie_aqc_fw_log_cfg_resp *fw_modules; + struct libie_aq_desc desc = {0}; + struct libie_aqc_fw_log *cmd; + int status; + int i; + + fw_modules = kcalloc(num_entries, sizeof(*fw_modules), GFP_KERNEL); + if (!fw_modules) + return -ENOMEM; + + for (i = 0; i < num_entries; i++) { + fw_modules[i].module_identifier = + cpu_to_le16(entries[i].module_id); + fw_modules[i].log_level = entries[i].log_level; + } + + desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_config); + desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI) | + cpu_to_le16(LIBIE_AQ_FLAG_RD); + + cmd = libie_aq_raw(&desc); + + cmd->cmd_flags = LIBIE_AQC_FW_LOG_CONF_SET_VALID; + cmd->ops.cfg.log_resolution = cpu_to_le16(log_resolution); + cmd->ops.cfg.mdl_cnt = cpu_to_le16(num_entries); + + if (options & LIBIE_FWLOG_OPTION_ARQ_ENA) + cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_AQ_EN; + if (options & LIBIE_FWLOG_OPTION_UART_ENA) + cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_UART_EN; + + status = fwlog->send_cmd(fwlog->priv, &desc, fw_modules, + sizeof(*fw_modules) * num_entries); + + kfree(fw_modules); + + return status; +} + +/** + * libie_fwlog_set - Set the firmware logging settings + * @fwlog: pointer to the fwlog structure + * @cfg: config used to set firmware logging + * + * This function should be called whenever the driver needs to set the firmware + * logging configuration. It can be called on initialization, reset, or during + * runtime. + * + * If the PF wishes to receive FW logging then it must register via + * libie_fwlog_register. Note, that libie_fwlog_register does not need to be called + * for init. 
+ */ +static int libie_fwlog_set(struct libie_fwlog *fwlog, + struct libie_fwlog_cfg *cfg) +{ + if (!libie_fwlog_supported(fwlog)) + return -EOPNOTSUPP; + + return libie_aq_fwlog_set(fwlog, cfg->module_entries, + LIBIE_AQC_FW_LOG_ID_MAX, cfg->options, + cfg->log_resolution); +} + +/** + * libie_aq_fwlog_register - Register PF for firmware logging events (0xFF31) + * @fwlog: pointer to the fwlog structure + * @reg: true to register and false to unregister + */ +static int libie_aq_fwlog_register(struct libie_fwlog *fwlog, bool reg) +{ + struct libie_aq_desc desc = {0}; + struct libie_aqc_fw_log *cmd; + + desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_register); + desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI); + cmd = libie_aq_raw(&desc); + + if (reg) + cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_REGISTER; + + return fwlog->send_cmd(fwlog->priv, &desc, NULL, 0); +} + +/** + * libie_fwlog_register - Register the PF for firmware logging + * @fwlog: pointer to the fwlog structure + * + * After this call the PF will start to receive firmware logging based on the + * configuration set in libie_fwlog_set. + */ +int libie_fwlog_register(struct libie_fwlog *fwlog) +{ + int status; + + if (!libie_fwlog_supported(fwlog)) + return -EOPNOTSUPP; + + status = libie_aq_fwlog_register(fwlog, true); + if (status) + dev_dbg(&fwlog->pdev->dev, "Failed to register for firmware logging events over ARQ\n"); + else + fwlog->cfg.options |= LIBIE_FWLOG_OPTION_IS_REGISTERED; + + return status; +} + +/** + * libie_fwlog_unregister - Unregister the PF from firmware logging + * @fwlog: pointer to the fwlog structure + */ +static int libie_fwlog_unregister(struct libie_fwlog *fwlog) +{ + int status; + + if (!libie_fwlog_supported(fwlog)) + return -EOPNOTSUPP; + + status = libie_aq_fwlog_register(fwlog, false); + if (status) + dev_dbg(&fwlog->pdev->dev, "Failed to unregister from firmware logging events over ARQ\n"); + else + fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_IS_REGISTERED; + + return status; +} + +/** + * libie_fwlog_print_module_cfg - print current FW logging module configuration * @cfg: pointer to the fwlog cfg structure * @module: module to print * @s: the seq file to put data into */ static void -ice_fwlog_print_module_cfg(struct ice_fwlog_cfg *cfg, int module, - struct seq_file *s) +libie_fwlog_print_module_cfg(struct libie_fwlog_cfg *cfg, int module, + struct seq_file *s) { - struct ice_fwlog_module_entry *entry; + struct libie_fwlog_module_entry *entry; if (module != LIBIE_AQC_FW_LOG_ID_MAX) { entry = &cfg->module_entries[module]; seq_printf(s, "\tModule: %s, Log Level: %s\n", - ice_fwlog_module_string[entry->module_id], - ice_fwlog_level_string[entry->log_level]); + libie_fwlog_module_string[entry->module_id], + libie_fwlog_level_string[entry->log_level]); } else { int i; @@ -200,19 +351,19 @@ ice_fwlog_print_module_cfg(struct ice_fwlog_cfg *cfg, int module, entry = &cfg->module_entries[i]; seq_printf(s, "\tModule: %s, Log Level: %s\n", - ice_fwlog_module_string[entry->module_id], - ice_fwlog_level_string[entry->log_level]); + libie_fwlog_module_string[entry->module_id], + libie_fwlog_level_string[entry->log_level]); } } } -static int ice_find_module_by_dentry(struct dentry **modules, struct dentry *d) +static int libie_find_module_by_dentry(struct dentry **modules, struct dentry *d) { int i, module; module = -1; /* find the module based on the dentry */ - for (i = 0; i < ICE_NR_FW_LOG_MODULES; i++) { + for (i = 0; i < LIBIE_NR_FW_LOG_MODULES; i++) { if (d == modules[i]) { module = i; break; @@ -223,47 +374,47 @@ 
static int ice_find_module_by_dentry(struct dentry **modules, struct dentry *d) } /** - * ice_debugfs_module_show - read from 'module' file + * libie_debugfs_module_show - read from 'module' file * @s: the opened file * @v: pointer to the offset */ -static int ice_debugfs_module_show(struct seq_file *s, void *v) +static int libie_debugfs_module_show(struct seq_file *s, void *v) { - struct ice_fwlog *fwlog = s->private; + struct libie_fwlog *fwlog = s->private; const struct file *filp = s->file; struct dentry *dentry; int module; dentry = file_dentry(filp); - module = ice_find_module_by_dentry(fwlog->debugfs_modules, dentry); + module = libie_find_module_by_dentry(fwlog->debugfs_modules, dentry); if (module < 0) { dev_info(&fwlog->pdev->dev, "unknown module\n"); return -EINVAL; } - ice_fwlog_print_module_cfg(&fwlog->cfg, module, s); + libie_fwlog_print_module_cfg(&fwlog->cfg, module, s); return 0; } -static int ice_debugfs_module_open(struct inode *inode, struct file *filp) +static int libie_debugfs_module_open(struct inode *inode, struct file *filp) { - return single_open(filp, ice_debugfs_module_show, inode->i_private); + return single_open(filp, libie_debugfs_module_show, inode->i_private); } /** - * ice_debugfs_module_write - write into 'module' file + * libie_debugfs_module_write - write into 'module' file * @filp: the opened file * @buf: where to find the user's data * @count: the length of the user's data * @ppos: file position offset */ static ssize_t -ice_debugfs_module_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) +libie_debugfs_module_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) { - struct ice_fwlog *fwlog = file_inode(filp)->i_private; + struct libie_fwlog *fwlog = file_inode(filp)->i_private; struct dentry *dentry = file_dentry(filp); struct device *dev = &fwlog->pdev->dev; char user_val[16], *cmd_buf; @@ -277,7 +428,7 @@ ice_debugfs_module_write(struct file *filp, const char __user *buf, if (IS_ERR(cmd_buf)) return PTR_ERR(cmd_buf); - module = ice_find_module_by_dentry(fwlog->debugfs_modules, dentry); + module = libie_find_module_by_dentry(fwlog->debugfs_modules, dentry); if (module < 0) { dev_info(dev, "unknown module\n"); return -EINVAL; @@ -287,7 +438,7 @@ ice_debugfs_module_write(struct file *filp, const char __user *buf, if (cnt != 1) return -EINVAL; - log_level = sysfs_match_string(ice_fwlog_level_string, user_val); + log_level = sysfs_match_string(libie_fwlog_level_string, user_val); if (log_level < 0) { dev_info(dev, "unknown log level '%s'\n", user_val); return -EINVAL; @@ -308,26 +459,26 @@ ice_debugfs_module_write(struct file *filp, const char __user *buf, return count; } -static const struct file_operations ice_debugfs_module_fops = { +static const struct file_operations libie_debugfs_module_fops = { .owner = THIS_MODULE, - .open = ice_debugfs_module_open, + .open = libie_debugfs_module_open, .read = seq_read, .release = single_release, - .write = ice_debugfs_module_write, + .write = libie_debugfs_module_write, }; /** - * ice_debugfs_nr_messages_read - read from 'nr_messages' file + * libie_debugfs_nr_messages_read - read from 'nr_messages' file * @filp: the opened file * @buffer: where to write the data for the user to read * @count: the size of the user's buffer * @ppos: file position offset */ -static ssize_t ice_debugfs_nr_messages_read(struct file *filp, - char __user *buffer, size_t count, - loff_t *ppos) +static ssize_t libie_debugfs_nr_messages_read(struct file *filp, + char __user *buffer, 
size_t count, + loff_t *ppos) { - struct ice_fwlog *fwlog = filp->private_data; + struct libie_fwlog *fwlog = filp->private_data; char buff[32] = {}; snprintf(buff, sizeof(buff), "%d\n", @@ -337,17 +488,17 @@ static ssize_t ice_debugfs_nr_messages_read(struct file *filp, } /** - * ice_debugfs_nr_messages_write - write into 'nr_messages' file + * libie_debugfs_nr_messages_write - write into 'nr_messages' file * @filp: the opened file * @buf: where to find the user's data * @count: the length of the user's data * @ppos: file position offset */ static ssize_t -ice_debugfs_nr_messages_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) +libie_debugfs_nr_messages_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) { - struct ice_fwlog *fwlog = filp->private_data; + struct libie_fwlog *fwlog = filp->private_data; struct device *dev = &fwlog->pdev->dev; char user_val[8], *cmd_buf; s16 nr_messages; @@ -382,46 +533,46 @@ ice_debugfs_nr_messages_write(struct file *filp, const char __user *buf, return count; } -static const struct file_operations ice_debugfs_nr_messages_fops = { +static const struct file_operations libie_debugfs_nr_messages_fops = { .owner = THIS_MODULE, .open = simple_open, - .read = ice_debugfs_nr_messages_read, - .write = ice_debugfs_nr_messages_write, + .read = libie_debugfs_nr_messages_read, + .write = libie_debugfs_nr_messages_write, }; /** - * ice_debugfs_enable_read - read from 'enable' file + * libie_debugfs_enable_read - read from 'enable' file * @filp: the opened file * @buffer: where to write the data for the user to read * @count: the size of the user's buffer * @ppos: file position offset */ -static ssize_t ice_debugfs_enable_read(struct file *filp, - char __user *buffer, size_t count, - loff_t *ppos) +static ssize_t libie_debugfs_enable_read(struct file *filp, + char __user *buffer, size_t count, + loff_t *ppos) { - struct ice_fwlog *fwlog = filp->private_data; + struct libie_fwlog *fwlog = filp->private_data; char buff[32] = {}; snprintf(buff, sizeof(buff), "%u\n", (u16)(fwlog->cfg.options & - ICE_FWLOG_OPTION_IS_REGISTERED) >> 3); + LIBIE_FWLOG_OPTION_IS_REGISTERED) >> 3); return simple_read_from_buffer(buffer, count, ppos, buff, strlen(buff)); } /** - * ice_debugfs_enable_write - write into 'enable' file + * libie_debugfs_enable_write - write into 'enable' file * @filp: the opened file * @buf: where to find the user's data * @count: the length of the user's data * @ppos: file position offset */ static ssize_t -ice_debugfs_enable_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) +libie_debugfs_enable_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) { - struct ice_fwlog *fwlog = filp->private_data; + struct libie_fwlog *fwlog = filp->private_data; char user_val[8], *cmd_buf; bool enable; ssize_t ret; @@ -443,18 +594,18 @@ ice_debugfs_enable_write(struct file *filp, const char __user *buf, goto enable_write_error; if (enable) - fwlog->cfg.options |= ICE_FWLOG_OPTION_ARQ_ENA; + fwlog->cfg.options |= LIBIE_FWLOG_OPTION_ARQ_ENA; else - fwlog->cfg.options &= ~ICE_FWLOG_OPTION_ARQ_ENA; + fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_ARQ_ENA; - ret = ice_fwlog_set(fwlog, &fwlog->cfg); + ret = libie_fwlog_set(fwlog, &fwlog->cfg); if (ret) goto enable_write_error; if (enable) - ret = ice_fwlog_register(fwlog); + ret = libie_fwlog_register(fwlog); else - ret = ice_fwlog_unregister(fwlog); + ret = libie_fwlog_unregister(fwlog); if (ret) goto enable_write_error; @@ -475,46 
+626,46 @@ enable_write_error: return ret; } -static const struct file_operations ice_debugfs_enable_fops = { +static const struct file_operations libie_debugfs_enable_fops = { .owner = THIS_MODULE, .open = simple_open, - .read = ice_debugfs_enable_read, - .write = ice_debugfs_enable_write, + .read = libie_debugfs_enable_read, + .write = libie_debugfs_enable_write, }; /** - * ice_debugfs_log_size_read - read from 'log_size' file + * libie_debugfs_log_size_read - read from 'log_size' file * @filp: the opened file * @buffer: where to write the data for the user to read * @count: the size of the user's buffer * @ppos: file position offset */ -static ssize_t ice_debugfs_log_size_read(struct file *filp, - char __user *buffer, size_t count, - loff_t *ppos) +static ssize_t libie_debugfs_log_size_read(struct file *filp, + char __user *buffer, size_t count, + loff_t *ppos) { - struct ice_fwlog *fwlog = filp->private_data; + struct libie_fwlog *fwlog = filp->private_data; char buff[32] = {}; int index; index = fwlog->ring.index; - snprintf(buff, sizeof(buff), "%s\n", ice_fwlog_log_size[index]); + snprintf(buff, sizeof(buff), "%s\n", libie_fwlog_log_size[index]); return simple_read_from_buffer(buffer, count, ppos, buff, strlen(buff)); } /** - * ice_debugfs_log_size_write - write into 'log_size' file + * libie_debugfs_log_size_write - write into 'log_size' file * @filp: the opened file * @buf: where to find the user's data * @count: the length of the user's data * @ppos: file position offset */ static ssize_t -ice_debugfs_log_size_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) +libie_debugfs_log_size_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) { - struct ice_fwlog *fwlog = filp->private_data; + struct libie_fwlog *fwlog = filp->private_data; struct device *dev = &fwlog->pdev->dev; char user_val[8], *cmd_buf; ssize_t ret; @@ -532,20 +683,20 @@ ice_debugfs_log_size_write(struct file *filp, const char __user *buf, if (ret != 1) return -EINVAL; - index = sysfs_match_string(ice_fwlog_log_size, user_val); + index = sysfs_match_string(libie_fwlog_log_size, user_val); if (index < 0) { dev_info(dev, "Invalid log size '%s'. The value must be one of 128K, 256K, 512K, 1M, 2M\n", user_val); ret = -EINVAL; goto log_size_write_error; - } else if (fwlog->cfg.options & ICE_FWLOG_OPTION_IS_REGISTERED) { + } else if (fwlog->cfg.options & LIBIE_FWLOG_OPTION_IS_REGISTERED) { dev_info(dev, "FW logging is currently running. 
Please disable FW logging to change log_size\n"); ret = -EINVAL; goto log_size_write_error; } /* free all the buffers and the tracking info and resize */ - ice_fwlog_realloc_rings(fwlog, index); + libie_fwlog_realloc_rings(fwlog, index); /* if we get here, nothing went wrong; return count since we didn't * really write anything @@ -563,32 +714,32 @@ log_size_write_error: return ret; } -static const struct file_operations ice_debugfs_log_size_fops = { +static const struct file_operations libie_debugfs_log_size_fops = { .owner = THIS_MODULE, .open = simple_open, - .read = ice_debugfs_log_size_read, - .write = ice_debugfs_log_size_write, + .read = libie_debugfs_log_size_read, + .write = libie_debugfs_log_size_write, }; /** - * ice_debugfs_data_read - read from 'data' file + * libie_debugfs_data_read - read from 'data' file * @filp: the opened file * @buffer: where to write the data for the user to read * @count: the size of the user's buffer * @ppos: file position offset */ -static ssize_t ice_debugfs_data_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) +static ssize_t libie_debugfs_data_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) { - struct ice_fwlog *fwlog = filp->private_data; + struct libie_fwlog *fwlog = filp->private_data; int data_copied = 0; bool done = false; - if (ice_fwlog_ring_empty(&fwlog->ring)) + if (libie_fwlog_ring_empty(&fwlog->ring)) return 0; - while (!ice_fwlog_ring_empty(&fwlog->ring) && !done) { - struct ice_fwlog_data *log; + while (!libie_fwlog_ring_empty(&fwlog->ring) && !done) { + struct libie_fwlog_data *log; u16 cur_buf_len; log = &fwlog->ring.rings[fwlog->ring.head]; @@ -610,24 +761,24 @@ static ssize_t ice_debugfs_data_read(struct file *filp, char __user *buffer, buffer += cur_buf_len; count -= cur_buf_len; *ppos += cur_buf_len; - ice_fwlog_ring_increment(&fwlog->ring.head, fwlog->ring.size); + libie_fwlog_ring_increment(&fwlog->ring.head, fwlog->ring.size); } return data_copied; } /** - * ice_debugfs_data_write - write into 'data' file + * libie_debugfs_data_write - write into 'data' file * @filp: the opened file * @buf: where to find the user's data * @count: the length of the user's data * @ppos: file position offset */ static ssize_t -ice_debugfs_data_write(struct file *filp, const char __user *buf, size_t count, - loff_t *ppos) +libie_debugfs_data_write(struct file *filp, const char __user *buf, size_t count, + loff_t *ppos) { - struct ice_fwlog *fwlog = filp->private_data; + struct libie_fwlog *fwlog = filp->private_data; struct device *dev = &fwlog->pdev->dev; ssize_t ret; @@ -638,7 +789,7 @@ ice_debugfs_data_write(struct file *filp, const char __user *buf, size_t count, /* any value is allowed to clear the buffer so no need to even look at * what the value is */ - if (!(fwlog->cfg.options & ICE_FWLOG_OPTION_IS_REGISTERED)) { + if (!(fwlog->cfg.options & LIBIE_FWLOG_OPTION_IS_REGISTERED)) { fwlog->ring.head = 0; fwlog->ring.tail = 0; } else { @@ -663,19 +814,20 @@ nr_buffs_write_error: return ret; } -static const struct file_operations ice_debugfs_data_fops = { +static const struct file_operations libie_debugfs_data_fops = { .owner = THIS_MODULE, .open = simple_open, - .read = ice_debugfs_data_read, - .write = ice_debugfs_data_write, + .read = libie_debugfs_data_read, + .write = libie_debugfs_data_write, }; /** - * ice_debugfs_fwlog_init - setup the debugfs directory + * libie_debugfs_fwlog_init - setup the debugfs directory * @fwlog: pointer to the fwlog structure * @root: debugfs root entry on 
which fwlog director will be registered */ -static void ice_debugfs_fwlog_init(struct ice_fwlog *fwlog, struct dentry *root) +static void libie_debugfs_fwlog_init(struct libie_fwlog *fwlog, + struct dentry *root) { struct dentry *fw_modules_dir; struct dentry **fw_modules; @@ -684,7 +836,7 @@ static void ice_debugfs_fwlog_init(struct ice_fwlog *fwlog, struct dentry *root) /* allocate space for this first because if it fails then we don't * need to unwind */ - fw_modules = kcalloc(ICE_NR_FW_LOG_MODULES, sizeof(*fw_modules), + fw_modules = kcalloc(LIBIE_NR_FW_LOG_MODULES, sizeof(*fw_modules), GFP_KERNEL); if (!fw_modules) return; @@ -697,27 +849,27 @@ static void ice_debugfs_fwlog_init(struct ice_fwlog *fwlog, struct dentry *root) if (IS_ERR(fw_modules_dir)) goto err_create_module_files; - for (i = 0; i < ICE_NR_FW_LOG_MODULES; i++) { - fw_modules[i] = debugfs_create_file(ice_fwlog_module_string[i], + for (i = 0; i < LIBIE_NR_FW_LOG_MODULES; i++) { + fw_modules[i] = debugfs_create_file(libie_fwlog_module_string[i], 0600, fw_modules_dir, fwlog, - &ice_debugfs_module_fops); + &libie_debugfs_module_fops); if (IS_ERR(fw_modules[i])) goto err_create_module_files; } debugfs_create_file("nr_messages", 0600, fwlog->debugfs, fwlog, - &ice_debugfs_nr_messages_fops); + &libie_debugfs_nr_messages_fops); fwlog->debugfs_modules = fw_modules; debugfs_create_file("enable", 0600, fwlog->debugfs, fwlog, - &ice_debugfs_enable_fops); + &libie_debugfs_enable_fops); debugfs_create_file("log_size", 0600, fwlog->debugfs, fwlog, - &ice_debugfs_log_size_fops); + &libie_debugfs_log_size_fops); debugfs_create_file("data", 0600, fwlog->debugfs, fwlog, - &ice_debugfs_data_fops); + &libie_debugfs_data_fops); return; @@ -726,7 +878,7 @@ err_create_module_files: kfree(fw_modules); } -static bool ice_fwlog_ring_full(struct ice_fwlog_ring *rings) +static bool libie_fwlog_ring_full(struct libie_fwlog_ring *rings) { u16 head, tail; @@ -742,27 +894,16 @@ static bool ice_fwlog_ring_full(struct ice_fwlog_ring *rings) } /** - * ice_fwlog_supported - Cached for whether FW supports FW logging or not - * @fwlog: pointer to the fwlog structure - * - * This will always return false if called before ice_init_hw(), so it must be - * called after ice_init_hw(). 
- */ -static bool ice_fwlog_supported(struct ice_fwlog *fwlog) -{ - return fwlog->supported; -} - -/** - * ice_aq_fwlog_get - Get the current firmware logging configuration (0xFF32) + * libie_aq_fwlog_get - Get the current firmware logging configuration (0xFF32) * @fwlog: pointer to the fwlog structure * @cfg: firmware logging configuration to populate */ -static int ice_aq_fwlog_get(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) +static int libie_aq_fwlog_get(struct libie_fwlog *fwlog, + struct libie_fwlog_cfg *cfg) { struct libie_aqc_fw_log_cfg_resp *fw_modules; + struct libie_aq_desc desc = {0}; struct libie_aqc_fw_log *cmd; - struct libie_aq_desc desc; u16 module_id_cnt; int status; void *buf; @@ -770,16 +911,17 @@ static int ice_aq_fwlog_get(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) memset(cfg, 0, sizeof(*cfg)); - buf = kzalloc(ICE_AQ_MAX_BUF_LEN, GFP_KERNEL); + buf = kzalloc(LIBIE_AQ_MAX_BUF_LEN, GFP_KERNEL); if (!buf) return -ENOMEM; - ice_fill_dflt_direct_cmd_desc(&desc, libie_aqc_opc_fw_logs_query); + desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_query); + desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI); cmd = libie_aq_raw(&desc); cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_QUERY; - status = fwlog->send_cmd(fwlog->priv, &desc, buf, ICE_AQ_MAX_BUF_LEN); + status = fwlog->send_cmd(fwlog->priv, &desc, buf, LIBIE_AQ_MAX_BUF_LEN); if (status) { dev_dbg(&fwlog->pdev->dev, "Failed to get FW log configuration\n"); goto status_out; @@ -796,11 +938,11 @@ static int ice_aq_fwlog_get(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) cfg->log_resolution = le16_to_cpu(cmd->ops.cfg.log_resolution); if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_CONF_AQ_EN) - cfg->options |= ICE_FWLOG_OPTION_ARQ_ENA; + cfg->options |= LIBIE_FWLOG_OPTION_ARQ_ENA; if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_CONF_UART_EN) - cfg->options |= ICE_FWLOG_OPTION_UART_ENA; + cfg->options |= LIBIE_FWLOG_OPTION_UART_ENA; if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_QUERY_REGISTERED) - cfg->options |= ICE_FWLOG_OPTION_IS_REGISTERED; + cfg->options |= LIBIE_FWLOG_OPTION_IS_REGISTERED; fw_modules = (struct libie_aqc_fw_log_cfg_resp *)buf; @@ -818,18 +960,18 @@ status_out: } /** - * ice_fwlog_set_supported - Set if FW logging is supported by FW + * libie_fwlog_set_supported - Set if FW logging is supported by FW * @fwlog: pointer to the fwlog structure * - * If FW returns success to the ice_aq_fwlog_get call then it supports FW + * If FW returns success to the libie_aq_fwlog_get call then it supports FW * logging, else it doesn't. Set the fwlog_supported flag accordingly. * * This function is only meant to be called during driver init to determine if * the FW support FW logging. 
*/ -static void ice_fwlog_set_supported(struct ice_fwlog *fwlog) +static void libie_fwlog_set_supported(struct libie_fwlog *fwlog) { - struct ice_fwlog_cfg *cfg; + struct libie_fwlog_cfg *cfg; int status; fwlog->supported = false; @@ -838,9 +980,9 @@ static void ice_fwlog_set_supported(struct ice_fwlog *fwlog) if (!cfg) return; - status = ice_aq_fwlog_get(fwlog, cfg); + status = libie_aq_fwlog_get(fwlog, cfg); if (status) - dev_dbg(&fwlog->pdev->dev, "ice_aq_fwlog_get failed, FW logging is not supported on this version of FW, status %d\n", + dev_dbg(&fwlog->pdev->dev, "libie_aq_fwlog_get failed, FW logging is not supported on this version of FW, status %d\n", status); else fwlog->supported = true; @@ -849,27 +991,27 @@ static void ice_fwlog_set_supported(struct ice_fwlog *fwlog) } /** - * ice_fwlog_init - Initialize FW logging configuration + * libie_fwlog_init - Initialize FW logging configuration * @fwlog: pointer to the fwlog structure * @api: api structure to init fwlog * * This function should be called on driver initialization during - * ice_init_hw(). + * libie_init_hw(). */ -int ice_fwlog_init(struct ice_fwlog *fwlog, struct ice_fwlog_api *api) +int libie_fwlog_init(struct libie_fwlog *fwlog, struct libie_fwlog_api *api) { fwlog->api = *api; - ice_fwlog_set_supported(fwlog); + libie_fwlog_set_supported(fwlog); - if (ice_fwlog_supported(fwlog)) { + if (libie_fwlog_supported(fwlog)) { int status; /* read the current config from the FW and store it */ - status = ice_aq_fwlog_get(fwlog, &fwlog->cfg); + status = libie_aq_fwlog_get(fwlog, &fwlog->cfg); if (status) return status; - fwlog->ring.rings = kcalloc(ICE_FWLOG_RING_SIZE_DFLT, + fwlog->ring.rings = kcalloc(LIBIE_FWLOG_RING_SIZE_DFLT, sizeof(*fwlog->ring.rings), GFP_KERNEL); if (!fwlog->ring.rings) { @@ -877,18 +1019,18 @@ int ice_fwlog_init(struct ice_fwlog *fwlog, struct ice_fwlog_api *api) return -ENOMEM; } - fwlog->ring.size = ICE_FWLOG_RING_SIZE_DFLT; - fwlog->ring.index = ICE_FWLOG_RING_SIZE_INDEX_DFLT; + fwlog->ring.size = LIBIE_FWLOG_RING_SIZE_DFLT; + fwlog->ring.index = LIBIE_FWLOG_RING_SIZE_INDEX_DFLT; - status = ice_fwlog_alloc_ring_buffs(&fwlog->ring); + status = libie_fwlog_alloc_ring_buffs(&fwlog->ring); if (status) { dev_warn(&fwlog->pdev->dev, "Unable to allocate memory for FW log ring data buffers\n"); - ice_fwlog_free_ring_buffs(&fwlog->ring); + libie_fwlog_free_ring_buffs(&fwlog->ring); kfree(fwlog->ring.rings); return status; } - ice_debugfs_fwlog_init(fwlog, api->debugfs_root); + libie_debugfs_fwlog_init(fwlog, api->debugfs_root); } else { dev_warn(&fwlog->pdev->dev, "FW logging is not supported in this NVM image. Please update the NVM to get FW log support\n"); } @@ -897,20 +1039,20 @@ int ice_fwlog_init(struct ice_fwlog *fwlog, struct ice_fwlog_api *api) } /** - * ice_fwlog_deinit - unroll FW logging configuration + * libie_fwlog_deinit - unroll FW logging configuration * @fwlog: pointer to the fwlog structure * - * This function should be called in ice_deinit_hw(). + * This function should be called in libie_deinit_hw(). 
*/ -void ice_fwlog_deinit(struct ice_fwlog *fwlog) +void libie_fwlog_deinit(struct libie_fwlog *fwlog) { int status; /* make sure FW logging is disabled to not put the FW in a weird state * for the next driver load */ - fwlog->cfg.options &= ~ICE_FWLOG_OPTION_ARQ_ENA; - status = ice_fwlog_set(fwlog, &fwlog->cfg); + fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_ARQ_ENA; + status = libie_fwlog_set(fwlog, &fwlog->cfg); if (status) dev_warn(&fwlog->pdev->dev, "Unable to turn off FW logging, status: %d\n", status); @@ -919,162 +1061,26 @@ void ice_fwlog_deinit(struct ice_fwlog *fwlog) fwlog->debugfs_modules = NULL; - status = ice_fwlog_unregister(fwlog); + status = libie_fwlog_unregister(fwlog); if (status) dev_warn(&fwlog->pdev->dev, "Unable to unregister FW logging, status: %d\n", status); if (fwlog->ring.rings) { - ice_fwlog_free_ring_buffs(&fwlog->ring); + libie_fwlog_free_ring_buffs(&fwlog->ring); kfree(fwlog->ring.rings); } } /** - * ice_aq_fwlog_set - Set FW logging configuration AQ command (0xFF30) - * @fwlog: pointer to the fwlog structure - * @entries: entries to configure - * @num_entries: number of @entries - * @options: options from ice_fwlog_cfg->options structure - * @log_resolution: logging resolution - */ -static int -ice_aq_fwlog_set(struct ice_fwlog *fwlog, - struct ice_fwlog_module_entry *entries, u16 num_entries, - u16 options, u16 log_resolution) -{ - struct libie_aqc_fw_log_cfg_resp *fw_modules; - struct libie_aqc_fw_log *cmd; - struct libie_aq_desc desc; - int status; - int i; - - fw_modules = kcalloc(num_entries, sizeof(*fw_modules), GFP_KERNEL); - if (!fw_modules) - return -ENOMEM; - - for (i = 0; i < num_entries; i++) { - fw_modules[i].module_identifier = - cpu_to_le16(entries[i].module_id); - fw_modules[i].log_level = entries[i].log_level; - } - - ice_fill_dflt_direct_cmd_desc(&desc, libie_aqc_opc_fw_logs_config); - desc.flags |= cpu_to_le16(LIBIE_AQ_FLAG_RD); - - cmd = libie_aq_raw(&desc); - - cmd->cmd_flags = LIBIE_AQC_FW_LOG_CONF_SET_VALID; - cmd->ops.cfg.log_resolution = cpu_to_le16(log_resolution); - cmd->ops.cfg.mdl_cnt = cpu_to_le16(num_entries); - - if (options & ICE_FWLOG_OPTION_ARQ_ENA) - cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_AQ_EN; - if (options & ICE_FWLOG_OPTION_UART_ENA) - cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_UART_EN; - - status = fwlog->send_cmd(fwlog->priv, &desc, fw_modules, - sizeof(*fw_modules) * num_entries); - - kfree(fw_modules); - - return status; -} - -/** - * ice_fwlog_set - Set the firmware logging settings - * @fwlog: pointer to the fwlog structure - * @cfg: config used to set firmware logging - * - * This function should be called whenever the driver needs to set the firmware - * logging configuration. It can be called on initialization, reset, or during - * runtime. - * - * If the PF wishes to receive FW logging then it must register via - * ice_fwlog_register. Note, that ice_fwlog_register does not need to be called - * for init. 
- */ -int ice_fwlog_set(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg) -{ - if (!ice_fwlog_supported(fwlog)) - return -EOPNOTSUPP; - - return ice_aq_fwlog_set(fwlog, cfg->module_entries, - LIBIE_AQC_FW_LOG_ID_MAX, cfg->options, - cfg->log_resolution); -} - -/** - * ice_aq_fwlog_register - Register PF for firmware logging events (0xFF31) - * @fwlog: pointer to the fwlog structure - * @reg: true to register and false to unregister - */ -static int ice_aq_fwlog_register(struct ice_fwlog *fwlog, bool reg) -{ - struct libie_aqc_fw_log *cmd; - struct libie_aq_desc desc; - - ice_fill_dflt_direct_cmd_desc(&desc, libie_aqc_opc_fw_logs_register); - cmd = libie_aq_raw(&desc); - - if (reg) - cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_REGISTER; - - return fwlog->send_cmd(fwlog->priv, &desc, NULL, 0); -} - -/** - * ice_fwlog_register - Register the PF for firmware logging - * @fwlog: pointer to the fwlog structure - * - * After this call the PF will start to receive firmware logging based on the - * configuration set in ice_fwlog_set. - */ -int ice_fwlog_register(struct ice_fwlog *fwlog) -{ - int status; - - if (!ice_fwlog_supported(fwlog)) - return -EOPNOTSUPP; - - status = ice_aq_fwlog_register(fwlog, true); - if (status) - dev_dbg(&fwlog->pdev->dev, "Failed to register for firmware logging events over ARQ\n"); - else - fwlog->cfg.options |= ICE_FWLOG_OPTION_IS_REGISTERED; - - return status; -} - -/** - * ice_fwlog_unregister - Unregister the PF from firmware logging - * @fwlog: pointer to the fwlog structure - */ -int ice_fwlog_unregister(struct ice_fwlog *fwlog) -{ - int status; - - if (!ice_fwlog_supported(fwlog)) - return -EOPNOTSUPP; - - status = ice_aq_fwlog_register(fwlog, false); - if (status) - dev_dbg(&fwlog->pdev->dev, "Failed to unregister from firmware logging events over ARQ\n"); - else - fwlog->cfg.options &= ~ICE_FWLOG_OPTION_IS_REGISTERED; - - return status; -} - -/** - * ice_get_fwlog_data - copy the FW log data from ARQ event + * libie_get_fwlog_data - copy the FW log data from ARQ event * @fwlog: fwlog that the FW log event is associated with * @buf: event buffer pointer * @len: len of event descriptor */ -void ice_get_fwlog_data(struct ice_fwlog *fwlog, u8 *buf, u16 len) +void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf, u16 len) { - struct ice_fwlog_data *log; + struct libie_fwlog_data *log; log = &fwlog->ring.rings[fwlog->ring.tail]; @@ -1082,10 +1088,10 @@ void ice_get_fwlog_data(struct ice_fwlog *fwlog, u8 *buf, u16 len) log->data_size = len; memcpy(log->data, buf, log->data_size); - ice_fwlog_ring_increment(&fwlog->ring.tail, fwlog->ring.size); + libie_fwlog_ring_increment(&fwlog->ring.tail, fwlog->ring.size); - if (ice_fwlog_ring_full(&fwlog->ring)) { + if (libie_fwlog_ring_full(&fwlog->ring)) { /* the rings are full so bump the head to create room */ - ice_fwlog_ring_increment(&fwlog->ring.head, fwlog->ring.size); + libie_fwlog_ring_increment(&fwlog->ring.head, fwlog->ring.size); } } diff --git a/drivers/net/ethernet/intel/ice/ice_fwlog.h b/drivers/net/ethernet/intel/ice/ice_fwlog.h index d5868b9e4de6..3698759c8ebb 100644 --- a/drivers/net/ethernet/intel/ice/ice_fwlog.h +++ b/drivers/net/ethernet/intel/ice/ice_fwlog.h @@ -1,77 +1,75 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) 2022, Intel Corporation. */ -#ifndef _ICE_FWLOG_H_ -#define _ICE_FWLOG_H_ +#ifndef _LIBIE_FWLOG_H_ +#define _LIBIE_FWLOG_H_ #include "ice_adminq_cmd.h" -struct ice_hw; - /* Only a single log level should be set and all log levels under the set value - * are enabled, e.g. 
if log level is set to ICE_FW_LOG_LEVEL_VERBOSE, then all - * other log levels are included (except ICE_FW_LOG_LEVEL_NONE) + * are enabled, e.g. if log level is set to LIBIE_FW_LOG_LEVEL_VERBOSE, then all + * other log levels are included (except LIBIE_FW_LOG_LEVEL_NONE) */ -enum ice_fwlog_level { - ICE_FWLOG_LEVEL_NONE = 0, - ICE_FWLOG_LEVEL_ERROR = 1, - ICE_FWLOG_LEVEL_WARNING = 2, - ICE_FWLOG_LEVEL_NORMAL = 3, - ICE_FWLOG_LEVEL_VERBOSE = 4, - ICE_FWLOG_LEVEL_INVALID, /* all values >= this entry are invalid */ +enum libie_fwlog_level { + LIBIE_FWLOG_LEVEL_NONE = 0, + LIBIE_FWLOG_LEVEL_ERROR = 1, + LIBIE_FWLOG_LEVEL_WARNING = 2, + LIBIE_FWLOG_LEVEL_NORMAL = 3, + LIBIE_FWLOG_LEVEL_VERBOSE = 4, + LIBIE_FWLOG_LEVEL_INVALID, /* all values >= this entry are invalid */ }; -struct ice_fwlog_module_entry { +struct libie_fwlog_module_entry { /* module ID for the corresponding firmware logging event */ u16 module_id; /* verbosity level for the module_id */ u8 log_level; }; -struct ice_fwlog_cfg { +struct libie_fwlog_cfg { /* list of modules for configuring log level */ - struct ice_fwlog_module_entry module_entries[LIBIE_AQC_FW_LOG_ID_MAX]; + struct libie_fwlog_module_entry module_entries[LIBIE_AQC_FW_LOG_ID_MAX]; /* options used to configure firmware logging */ u16 options; -#define ICE_FWLOG_OPTION_ARQ_ENA BIT(0) -#define ICE_FWLOG_OPTION_UART_ENA BIT(1) - /* set before calling ice_fwlog_init() so the PF registers for firmware - * logging on initialization +#define LIBIE_FWLOG_OPTION_ARQ_ENA BIT(0) +#define LIBIE_FWLOG_OPTION_UART_ENA BIT(1) + /* set before calling libie_fwlog_init() so the PF registers for + * firmware logging on initialization */ -#define ICE_FWLOG_OPTION_REGISTER_ON_INIT BIT(2) - /* set in the ice_aq_fwlog_get() response if the PF is registered for FW - * logging events over ARQ +#define LIBIE_FWLOG_OPTION_REGISTER_ON_INIT BIT(2) + /* set in the libie_aq_fwlog_get() response if the PF is registered for + * FW logging events over ARQ */ -#define ICE_FWLOG_OPTION_IS_REGISTERED BIT(3) +#define LIBIE_FWLOG_OPTION_IS_REGISTERED BIT(3) /* minimum number of log events sent per Admin Receive Queue event */ u16 log_resolution; }; -struct ice_fwlog_data { +struct libie_fwlog_data { u16 data_size; u8 *data; }; -struct ice_fwlog_ring { - struct ice_fwlog_data *rings; +struct libie_fwlog_ring { + struct libie_fwlog_data *rings; u16 index; u16 size; u16 head; u16 tail; }; -#define ICE_FWLOG_RING_SIZE_INDEX_DFLT 3 -#define ICE_FWLOG_RING_SIZE_DFLT 256 -#define ICE_FWLOG_RING_SIZE_MAX 512 +#define LIBIE_FWLOG_RING_SIZE_INDEX_DFLT 3 +#define LIBIE_FWLOG_RING_SIZE_DFLT 256 +#define LIBIE_FWLOG_RING_SIZE_MAX 512 -struct ice_fwlog { - struct ice_fwlog_cfg cfg; +struct libie_fwlog { + struct libie_fwlog_cfg cfg; bool supported; /* does hardware support FW logging? 
*/ - struct ice_fwlog_ring ring; + struct libie_fwlog_ring ring; struct dentry *debugfs; /* keep track of all the dentrys for FW log modules */ struct dentry **debugfs_modules; - struct_group_tagged(ice_fwlog_api, api, + struct_group_tagged(libie_fwlog_api, api, struct pci_dev *pdev; int (*send_cmd)(void *, struct libie_aq_desc *, void *, u16); void *priv; @@ -79,10 +77,8 @@ struct ice_fwlog { ); }; -int ice_fwlog_init(struct ice_fwlog *fwlog, struct ice_fwlog_api *api); -void ice_fwlog_deinit(struct ice_fwlog *fwlog); -int ice_fwlog_set(struct ice_fwlog *fwlog, struct ice_fwlog_cfg *cfg); -int ice_fwlog_register(struct ice_fwlog *fwlog); -int ice_fwlog_unregister(struct ice_fwlog *fwlog); -void ice_get_fwlog_data(struct ice_fwlog *fwlog, u8 *buf, u16 len); -#endif /* _ICE_FWLOG_H_ */ +int libie_fwlog_init(struct libie_fwlog *fwlog, struct libie_fwlog_api *api); +void libie_fwlog_deinit(struct libie_fwlog *fwlog); +int libie_fwlog_register(struct libie_fwlog *fwlog); +void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf, u16 len); +#endif /* _LIBIE_FWLOG_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index e7e775f7ea34..44e184c6c416 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -1540,8 +1540,8 @@ static int __ice_clean_ctrlq(struct ice_pf *pf, enum ice_ctl_q q_type) } break; case ice_aqc_opc_fw_logs_event: - ice_get_fwlog_data(&hw->fwlog, event.msg_buf, - le16_to_cpu(event.desc.datalen)); + libie_get_fwlog_data(&hw->fwlog, event.msg_buf, + le16_to_cpu(event.desc.datalen)); break; case ice_aqc_opc_lldp_set_mib_change: ice_dcb_process_lldp_set_mib_change(pf, &event); diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h index 643d84cc78df..14296bccc4c9 100644 --- a/drivers/net/ethernet/intel/ice/ice_type.h +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -948,7 +948,7 @@ struct ice_hw { u8 fw_patch; /* firmware patch version */ u32 fw_build; /* firmware build number */ - struct ice_fwlog fwlog; + struct libie_fwlog fwlog; /* Device max aggregate bandwidths corresponding to the GL_PWR_MODE_CTL * register. Used for determining the ITR/INTRL granularity during diff --git a/include/linux/net/intel/libie/adminq.h b/include/linux/net/intel/libie/adminq.h index ca2ac88b5709..29420193889a 100644 --- a/include/linux/net/intel/libie/adminq.h +++ b/include/linux/net/intel/libie/adminq.h @@ -9,6 +9,7 @@ #define LIBIE_CHECK_STRUCT_LEN(n, X) \ static_assert((n) == sizeof(struct X)) +#define LIBIE_AQ_MAX_BUF_LEN 4096 /** * struct libie_aqc_generic - Generic structure used in adminq communication -- cgit v1.2.3 From f3b3fc1ff0823b73f6f66b6340e6ebc4b00d2ed3 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Tue, 12 Aug 2025 06:23:35 +0200 Subject: ice, libie: move fwlog code to libie Move the whole fwlog code from ice_fwlog.c/h to libie/fwlog.c/h.
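For a consumer other than ice, the wiring is expected to look like the minimal sketch below, modeled on the ice glue visible earlier in this series (__fwlog_init() in ice_common.c and the ARQ dispatch in __ice_clean_ctrlq()). All my_*-prefixed names are hypothetical placeholders for the consumer driver, not part of libie; only the libie_fwlog_* symbols and the struct libie_fwlog_api fields are taken from these patches.

#include <linux/net/intel/libie/fwlog.h>

/* Hypothetical consumer; mirrors how ice embeds the fwlog state in ice_hw. */
struct my_hw {
	struct libie_fwlog fwlog;
	struct pci_dev *pdev;
	struct dentry *debugfs_root;
};

/* Glue callback installed via libie_fwlog_api::send_cmd. */
static int my_send_cmd(void *priv, struct libie_aq_desc *desc, void *buf,
		       u16 size)
{
	struct my_hw *hw = priv;	/* recover the driver's own state */

	/* A real driver posts @desc (and @buf of @size bytes) on its admin
	 * send queue here and returns 0 or a negative errno.
	 */
	return hw ? 0 : -ENODEV;
}

static int my_fwlog_init(struct my_hw *hw)
{
	struct libie_fwlog_api api = {
		.pdev = hw->pdev,
		.send_cmd = my_send_cmd,
		.priv = hw,
		.debugfs_root = hw->debugfs_root,
	};

	return libie_fwlog_init(&hw->fwlog, &api);
}

On an admin receive queue event carrying opcode libie_aqc_opc_fw_logs_event (0xFF33), the driver hands the payload to libie_get_fwlog_data(&hw->fwlog, event.msg_buf, le16_to_cpu(event.desc.datalen)) and calls libie_fwlog_deinit(&hw->fwlog) on teardown, exactly as the ice hunks above do. A consumer also selects LIBIE_FWLOG in Kconfig, as the ice entry does; the debugfs files (per-module level, enable, log_size, data) are then created and serviced entirely inside libie.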
Reviewed-by: Przemek Kitszel Signed-off-by: Michal Swiatkowski Tested-by: Rinitha S (A Contingent worker at Intel) Reviewed-by: Simon Horman Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/Kconfig | 1 + drivers/net/ethernet/intel/ice/Makefile | 1 - drivers/net/ethernet/intel/ice/ice_fwlog.c | 1106 --------------------------- drivers/net/ethernet/intel/ice/ice_fwlog.h | 84 --- drivers/net/ethernet/intel/ice/ice_main.c | 1 + drivers/net/ethernet/intel/ice/ice_type.h | 2 +- drivers/net/ethernet/intel/libie/Kconfig | 9 + drivers/net/ethernet/intel/libie/Makefile | 4 + drivers/net/ethernet/intel/libie/fwlog.c | 1115 ++++++++++++++++++++++++++++ include/linux/net/intel/libie/adminq.h | 6 +- include/linux/net/intel/libie/fwlog.h | 85 +++ 11 files changed, 1219 insertions(+), 1195 deletions(-) delete mode 100644 drivers/net/ethernet/intel/ice/ice_fwlog.c delete mode 100644 drivers/net/ethernet/intel/ice/ice_fwlog.h create mode 100644 drivers/net/ethernet/intel/libie/fwlog.c create mode 100644 include/linux/net/intel/libie/fwlog.h (limited to 'include/linux') diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index b05cc0d7a15d..09f0af386af1 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -297,6 +297,7 @@ config ICE select DIMLIB select LIBIE select LIBIE_ADMINQ + select LIBIE_FWLOG select NET_DEVLINK select PACKING select PLDMFW diff --git a/drivers/net/ethernet/intel/ice/Makefile b/drivers/net/ethernet/intel/ice/Makefile index eac45d7c0cf1..5b2c666496e7 100644 --- a/drivers/net/ethernet/intel/ice/Makefile +++ b/drivers/net/ethernet/intel/ice/Makefile @@ -42,7 +42,6 @@ ice-y := ice_main.o \ ice_ethtool.o \ ice_repr.o \ ice_tc_lib.o \ - ice_fwlog.o \ ice_debugfs.o \ ice_adapter.o ice-$(CONFIG_PCI_IOV) += \ diff --git a/drivers/net/ethernet/intel/ice/ice_fwlog.c b/drivers/net/ethernet/intel/ice/ice_fwlog.c deleted file mode 100644 index 8e7086191030..000000000000 --- a/drivers/net/ethernet/intel/ice/ice_fwlog.c +++ /dev/null @@ -1,1106 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2022, Intel Corporation. */ - -#include -#include -#include -#include -#include "ice.h" -#include "ice_common.h" -#include "ice_fwlog.h" - -/* create a define that has an extra module that doesn't really exist. this - * is so we can add a module 'all' to easily enable/disable all the modules - */ -#define LIBIE_NR_FW_LOG_MODULES (LIBIE_AQC_FW_LOG_ID_MAX + 1) - -/* the ordering in this array is important. it matches the ordering of the - * values in the FW so the index is the same value as in - * libie_aqc_fw_logging_mod - */ -static const char * const libie_fwlog_module_string[] = { - "general", - "ctrl", - "link", - "link_topo", - "dnl", - "i2c", - "sdp", - "mdio", - "adminq", - "hdma", - "lldp", - "dcbx", - "dcb", - "xlr", - "nvm", - "auth", - "vpd", - "iosf", - "parser", - "sw", - "scheduler", - "txq", - "rsvd", - "post", - "watchdog", - "task_dispatch", - "mng", - "synce", - "health", - "tsdrv", - "pfreg", - "mdlver", - "all", -}; - -/* the ordering in this array is important. 
it matches the ordering of the - * values in the FW so the index is the same value as in libie_fwlog_level - */ -static const char * const libie_fwlog_level_string[] = { - "none", - "error", - "warning", - "normal", - "verbose", -}; - -static const char * const libie_fwlog_log_size[] = { - "128K", - "256K", - "512K", - "1M", - "2M", -}; - -static bool libie_fwlog_ring_empty(struct libie_fwlog_ring *rings) -{ - return rings->head == rings->tail; -} - -static void libie_fwlog_ring_increment(u16 *item, u16 size) -{ - *item = (*item + 1) & (size - 1); -} - -static int libie_fwlog_alloc_ring_buffs(struct libie_fwlog_ring *rings) -{ - int i, nr_bytes; - u8 *mem; - - nr_bytes = rings->size * LIBIE_AQ_MAX_BUF_LEN; - mem = vzalloc(nr_bytes); - if (!mem) - return -ENOMEM; - - for (i = 0; i < rings->size; i++) { - struct libie_fwlog_data *ring = &rings->rings[i]; - - ring->data_size = LIBIE_AQ_MAX_BUF_LEN; - ring->data = mem; - mem += LIBIE_AQ_MAX_BUF_LEN; - } - - return 0; -} - -static void libie_fwlog_free_ring_buffs(struct libie_fwlog_ring *rings) -{ - int i; - - for (i = 0; i < rings->size; i++) { - struct libie_fwlog_data *ring = &rings->rings[i]; - - /* the first ring is the base memory for the whole range so - * free it - */ - if (!i) - vfree(ring->data); - - ring->data = NULL; - ring->data_size = 0; - } -} - -#define LIBIE_FWLOG_INDEX_TO_BYTES(n) ((128 * 1024) << (n)) -/** - * libie_fwlog_realloc_rings - reallocate the FW log rings - * @fwlog: pointer to the fwlog structure - * @index: the new index to use to allocate memory for the log data - * - */ -static void libie_fwlog_realloc_rings(struct libie_fwlog *fwlog, int index) -{ - struct libie_fwlog_ring ring; - int status, ring_size; - - /* convert the number of bytes into a number of 4K buffers. externally - * the driver presents the interface to the FW log data as a number of - * bytes because that's easy for users to understand. internally the - * driver uses a ring of buffers because the driver doesn't know where - * the beginning and end of any line of log data is so the driver has - * to overwrite data as complete blocks. when the data is returned to - * the user the driver knows that the data is correct and the FW log - * can be correctly parsed by the tools - */ - ring_size = LIBIE_FWLOG_INDEX_TO_BYTES(index) / LIBIE_AQ_MAX_BUF_LEN; - if (ring_size == fwlog->ring.size) - return; - - /* allocate space for the new rings and buffers then release the - * old rings and buffers. that way if we don't have enough - * memory then we at least have what we had before - */ - ring.rings = kcalloc(ring_size, sizeof(*ring.rings), GFP_KERNEL); - if (!ring.rings) - return; - - ring.size = ring_size; - - status = libie_fwlog_alloc_ring_buffs(&ring); - if (status) { - dev_warn(&fwlog->pdev->dev, "Unable to allocate memory for FW log ring data buffers\n"); - libie_fwlog_free_ring_buffs(&ring); - kfree(ring.rings); - return; - } - - libie_fwlog_free_ring_buffs(&fwlog->ring); - kfree(fwlog->ring.rings); - - fwlog->ring.rings = ring.rings; - fwlog->ring.size = ring.size; - fwlog->ring.index = index; - fwlog->ring.head = 0; - fwlog->ring.tail = 0; -} - -/** - * libie_fwlog_supported - Cached for whether FW supports FW logging or not - * @fwlog: pointer to the fwlog structure - * - * This will always return false if called before libie_init_hw(), so it must be - * called after libie_init_hw(). 
- */ -static bool libie_fwlog_supported(struct libie_fwlog *fwlog) -{ - return fwlog->supported; -} - -/** - * libie_aq_fwlog_set - Set FW logging configuration AQ command (0xFF30) - * @fwlog: pointer to the fwlog structure - * @entries: entries to configure - * @num_entries: number of @entries - * @options: options from libie_fwlog_cfg->options structure - * @log_resolution: logging resolution - */ -static int -libie_aq_fwlog_set(struct libie_fwlog *fwlog, - struct libie_fwlog_module_entry *entries, u16 num_entries, - u16 options, u16 log_resolution) -{ - struct libie_aqc_fw_log_cfg_resp *fw_modules; - struct libie_aq_desc desc = {0}; - struct libie_aqc_fw_log *cmd; - int status; - int i; - - fw_modules = kcalloc(num_entries, sizeof(*fw_modules), GFP_KERNEL); - if (!fw_modules) - return -ENOMEM; - - for (i = 0; i < num_entries; i++) { - fw_modules[i].module_identifier = - cpu_to_le16(entries[i].module_id); - fw_modules[i].log_level = entries[i].log_level; - } - - desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_config); - desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI) | - cpu_to_le16(LIBIE_AQ_FLAG_RD); - - cmd = libie_aq_raw(&desc); - - cmd->cmd_flags = LIBIE_AQC_FW_LOG_CONF_SET_VALID; - cmd->ops.cfg.log_resolution = cpu_to_le16(log_resolution); - cmd->ops.cfg.mdl_cnt = cpu_to_le16(num_entries); - - if (options & LIBIE_FWLOG_OPTION_ARQ_ENA) - cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_AQ_EN; - if (options & LIBIE_FWLOG_OPTION_UART_ENA) - cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_UART_EN; - - status = fwlog->send_cmd(fwlog->priv, &desc, fw_modules, - sizeof(*fw_modules) * num_entries); - - kfree(fw_modules); - - return status; -} - -/** - * libie_fwlog_set - Set the firmware logging settings - * @fwlog: pointer to the fwlog structure - * @cfg: config used to set firmware logging - * - * This function should be called whenever the driver needs to set the firmware - * logging configuration. It can be called on initialization, reset, or during - * runtime. - * - * If the PF wishes to receive FW logging then it must register via - * libie_fwlog_register. Note, that libie_fwlog_register does not need to be called - * for init. - */ -static int libie_fwlog_set(struct libie_fwlog *fwlog, - struct libie_fwlog_cfg *cfg) -{ - if (!libie_fwlog_supported(fwlog)) - return -EOPNOTSUPP; - - return libie_aq_fwlog_set(fwlog, cfg->module_entries, - LIBIE_AQC_FW_LOG_ID_MAX, cfg->options, - cfg->log_resolution); -} - -/** - * libie_aq_fwlog_register - Register PF for firmware logging events (0xFF31) - * @fwlog: pointer to the fwlog structure - * @reg: true to register and false to unregister - */ -static int libie_aq_fwlog_register(struct libie_fwlog *fwlog, bool reg) -{ - struct libie_aq_desc desc = {0}; - struct libie_aqc_fw_log *cmd; - - desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_register); - desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI); - cmd = libie_aq_raw(&desc); - - if (reg) - cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_REGISTER; - - return fwlog->send_cmd(fwlog->priv, &desc, NULL, 0); -} - -/** - * libie_fwlog_register - Register the PF for firmware logging - * @fwlog: pointer to the fwlog structure - * - * After this call the PF will start to receive firmware logging based on the - * configuration set in libie_fwlog_set. 
- */ -static int libie_fwlog_register(struct libie_fwlog *fwlog) -{ - int status; - - if (!libie_fwlog_supported(fwlog)) - return -EOPNOTSUPP; - - status = libie_aq_fwlog_register(fwlog, true); - if (status) - dev_dbg(&fwlog->pdev->dev, "Failed to register for firmware logging events over ARQ\n"); - else - fwlog->cfg.options |= LIBIE_FWLOG_OPTION_IS_REGISTERED; - - return status; -} - -/** - * libie_fwlog_unregister - Unregister the PF from firmware logging - * @fwlog: pointer to the fwlog structure - */ -static int libie_fwlog_unregister(struct libie_fwlog *fwlog) -{ - int status; - - if (!libie_fwlog_supported(fwlog)) - return -EOPNOTSUPP; - - status = libie_aq_fwlog_register(fwlog, false); - if (status) - dev_dbg(&fwlog->pdev->dev, "Failed to unregister from firmware logging events over ARQ\n"); - else - fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_IS_REGISTERED; - - return status; -} - -/** - * libie_fwlog_print_module_cfg - print current FW logging module configuration - * @cfg: pointer to the fwlog cfg structure - * @module: module to print - * @s: the seq file to put data into - */ -static void -libie_fwlog_print_module_cfg(struct libie_fwlog_cfg *cfg, int module, - struct seq_file *s) -{ - struct libie_fwlog_module_entry *entry; - - if (module != LIBIE_AQC_FW_LOG_ID_MAX) { - entry = &cfg->module_entries[module]; - - seq_printf(s, "\tModule: %s, Log Level: %s\n", - libie_fwlog_module_string[entry->module_id], - libie_fwlog_level_string[entry->log_level]); - } else { - int i; - - for (i = 0; i < LIBIE_AQC_FW_LOG_ID_MAX; i++) { - entry = &cfg->module_entries[i]; - - seq_printf(s, "\tModule: %s, Log Level: %s\n", - libie_fwlog_module_string[entry->module_id], - libie_fwlog_level_string[entry->log_level]); - } - } -} - -static int libie_find_module_by_dentry(struct dentry **modules, struct dentry *d) -{ - int i, module; - - module = -1; - /* find the module based on the dentry */ - for (i = 0; i < LIBIE_NR_FW_LOG_MODULES; i++) { - if (d == modules[i]) { - module = i; - break; - } - } - - return module; -} - -/** - * libie_debugfs_module_show - read from 'module' file - * @s: the opened file - * @v: pointer to the offset - */ -static int libie_debugfs_module_show(struct seq_file *s, void *v) -{ - struct libie_fwlog *fwlog = s->private; - const struct file *filp = s->file; - struct dentry *dentry; - int module; - - dentry = file_dentry(filp); - - module = libie_find_module_by_dentry(fwlog->debugfs_modules, dentry); - if (module < 0) { - dev_info(&fwlog->pdev->dev, "unknown module\n"); - return -EINVAL; - } - - libie_fwlog_print_module_cfg(&fwlog->cfg, module, s); - - return 0; -} - -static int libie_debugfs_module_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, libie_debugfs_module_show, inode->i_private); -} - -/** - * libie_debugfs_module_write - write into 'module' file - * @filp: the opened file - * @buf: where to find the user's data - * @count: the length of the user's data - * @ppos: file position offset - */ -static ssize_t -libie_debugfs_module_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct libie_fwlog *fwlog = file_inode(filp)->i_private; - struct dentry *dentry = file_dentry(filp); - struct device *dev = &fwlog->pdev->dev; - char user_val[16], *cmd_buf; - int module, log_level, cnt; - - /* don't allow partial writes or invalid input */ - if (*ppos != 0 || count > 8) - return -EINVAL; - - cmd_buf = memdup_user_nul(buf, count); - if (IS_ERR(cmd_buf)) - return PTR_ERR(cmd_buf); - - module = 
libie_find_module_by_dentry(fwlog->debugfs_modules, dentry); - if (module < 0) { - dev_info(dev, "unknown module\n"); - return -EINVAL; - } - - cnt = sscanf(cmd_buf, "%s", user_val); - if (cnt != 1) - return -EINVAL; - - log_level = sysfs_match_string(libie_fwlog_level_string, user_val); - if (log_level < 0) { - dev_info(dev, "unknown log level '%s'\n", user_val); - return -EINVAL; - } - - if (module != LIBIE_AQC_FW_LOG_ID_MAX) { - fwlog->cfg.module_entries[module].log_level = log_level; - } else { - /* the module 'all' is a shortcut so that we can set - * all of the modules to the same level quickly - */ - int i; - - for (i = 0; i < LIBIE_AQC_FW_LOG_ID_MAX; i++) - fwlog->cfg.module_entries[i].log_level = log_level; - } - - return count; -} - -static const struct file_operations libie_debugfs_module_fops = { - .owner = THIS_MODULE, - .open = libie_debugfs_module_open, - .read = seq_read, - .release = single_release, - .write = libie_debugfs_module_write, -}; - -/** - * libie_debugfs_nr_messages_read - read from 'nr_messages' file - * @filp: the opened file - * @buffer: where to write the data for the user to read - * @count: the size of the user's buffer - * @ppos: file position offset - */ -static ssize_t libie_debugfs_nr_messages_read(struct file *filp, - char __user *buffer, size_t count, - loff_t *ppos) -{ - struct libie_fwlog *fwlog = filp->private_data; - char buff[32] = {}; - - snprintf(buff, sizeof(buff), "%d\n", - fwlog->cfg.log_resolution); - - return simple_read_from_buffer(buffer, count, ppos, buff, strlen(buff)); -} - -/** - * libie_debugfs_nr_messages_write - write into 'nr_messages' file - * @filp: the opened file - * @buf: where to find the user's data - * @count: the length of the user's data - * @ppos: file position offset - */ -static ssize_t -libie_debugfs_nr_messages_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct libie_fwlog *fwlog = filp->private_data; - struct device *dev = &fwlog->pdev->dev; - char user_val[8], *cmd_buf; - s16 nr_messages; - ssize_t ret; - - /* don't allow partial writes or invalid input */ - if (*ppos != 0 || count > 4) - return -EINVAL; - - cmd_buf = memdup_user_nul(buf, count); - if (IS_ERR(cmd_buf)) - return PTR_ERR(cmd_buf); - - ret = sscanf(cmd_buf, "%s", user_val); - if (ret != 1) - return -EINVAL; - - ret = kstrtos16(user_val, 0, &nr_messages); - if (ret) - return ret; - - if (nr_messages < LIBIE_AQC_FW_LOG_MIN_RESOLUTION || - nr_messages > LIBIE_AQC_FW_LOG_MAX_RESOLUTION) { - dev_err(dev, "Invalid FW log number of messages %d, value must be between %d - %d\n", - nr_messages, LIBIE_AQC_FW_LOG_MIN_RESOLUTION, - LIBIE_AQC_FW_LOG_MAX_RESOLUTION); - return -EINVAL; - } - - fwlog->cfg.log_resolution = nr_messages; - - return count; -} - -static const struct file_operations libie_debugfs_nr_messages_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .read = libie_debugfs_nr_messages_read, - .write = libie_debugfs_nr_messages_write, -}; - -/** - * libie_debugfs_enable_read - read from 'enable' file - * @filp: the opened file - * @buffer: where to write the data for the user to read - * @count: the size of the user's buffer - * @ppos: file position offset - */ -static ssize_t libie_debugfs_enable_read(struct file *filp, - char __user *buffer, size_t count, - loff_t *ppos) -{ - struct libie_fwlog *fwlog = filp->private_data; - char buff[32] = {}; - - snprintf(buff, sizeof(buff), "%u\n", - (u16)(fwlog->cfg.options & - LIBIE_FWLOG_OPTION_IS_REGISTERED) >> 3); - - return simple_read_from_buffer(buffer, 
count, ppos, buff, strlen(buff)); -} - -/** - * libie_debugfs_enable_write - write into 'enable' file - * @filp: the opened file - * @buf: where to find the user's data - * @count: the length of the user's data - * @ppos: file position offset - */ -static ssize_t -libie_debugfs_enable_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct libie_fwlog *fwlog = filp->private_data; - char user_val[8], *cmd_buf; - bool enable; - ssize_t ret; - - /* don't allow partial writes or invalid input */ - if (*ppos != 0 || count > 2) - return -EINVAL; - - cmd_buf = memdup_user_nul(buf, count); - if (IS_ERR(cmd_buf)) - return PTR_ERR(cmd_buf); - - ret = sscanf(cmd_buf, "%s", user_val); - if (ret != 1) - return -EINVAL; - - ret = kstrtobool(user_val, &enable); - if (ret) - goto enable_write_error; - - if (enable) - fwlog->cfg.options |= LIBIE_FWLOG_OPTION_ARQ_ENA; - else - fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_ARQ_ENA; - - ret = libie_fwlog_set(fwlog, &fwlog->cfg); - if (ret) - goto enable_write_error; - - if (enable) - ret = libie_fwlog_register(fwlog); - else - ret = libie_fwlog_unregister(fwlog); - - if (ret) - goto enable_write_error; - - /* if we get here, nothing went wrong; return count since we didn't - * really write anything - */ - ret = (ssize_t)count; - -enable_write_error: - /* This function always consumes all of the written input, or produces - * an error. Check and enforce this. Otherwise, the write operation - * won't complete properly. - */ - if (WARN_ON(ret != (ssize_t)count && ret >= 0)) - ret = -EIO; - - return ret; -} - -static const struct file_operations libie_debugfs_enable_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .read = libie_debugfs_enable_read, - .write = libie_debugfs_enable_write, -}; - -/** - * libie_debugfs_log_size_read - read from 'log_size' file - * @filp: the opened file - * @buffer: where to write the data for the user to read - * @count: the size of the user's buffer - * @ppos: file position offset - */ -static ssize_t libie_debugfs_log_size_read(struct file *filp, - char __user *buffer, size_t count, - loff_t *ppos) -{ - struct libie_fwlog *fwlog = filp->private_data; - char buff[32] = {}; - int index; - - index = fwlog->ring.index; - snprintf(buff, sizeof(buff), "%s\n", libie_fwlog_log_size[index]); - - return simple_read_from_buffer(buffer, count, ppos, buff, strlen(buff)); -} - -/** - * libie_debugfs_log_size_write - write into 'log_size' file - * @filp: the opened file - * @buf: where to find the user's data - * @count: the length of the user's data - * @ppos: file position offset - */ -static ssize_t -libie_debugfs_log_size_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct libie_fwlog *fwlog = filp->private_data; - struct device *dev = &fwlog->pdev->dev; - char user_val[8], *cmd_buf; - ssize_t ret; - int index; - - /* don't allow partial writes or invalid input */ - if (*ppos != 0 || count > 5) - return -EINVAL; - - cmd_buf = memdup_user_nul(buf, count); - if (IS_ERR(cmd_buf)) - return PTR_ERR(cmd_buf); - - ret = sscanf(cmd_buf, "%s", user_val); - if (ret != 1) - return -EINVAL; - - index = sysfs_match_string(libie_fwlog_log_size, user_val); - if (index < 0) { - dev_info(dev, "Invalid log size '%s'. The value must be one of 128K, 256K, 512K, 1M, 2M\n", - user_val); - ret = -EINVAL; - goto log_size_write_error; - } else if (fwlog->cfg.options & LIBIE_FWLOG_OPTION_IS_REGISTERED) { - dev_info(dev, "FW logging is currently running. 
Please disable FW logging to change log_size\n"); - ret = -EINVAL; - goto log_size_write_error; - } - - /* free all the buffers and the tracking info and resize */ - libie_fwlog_realloc_rings(fwlog, index); - - /* if we get here, nothing went wrong; return count since we didn't - * really write anything - */ - ret = (ssize_t)count; - -log_size_write_error: - /* This function always consumes all of the written input, or produces - * an error. Check and enforce this. Otherwise, the write operation - * won't complete properly. - */ - if (WARN_ON(ret != (ssize_t)count && ret >= 0)) - ret = -EIO; - - return ret; -} - -static const struct file_operations libie_debugfs_log_size_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .read = libie_debugfs_log_size_read, - .write = libie_debugfs_log_size_write, -}; - -/** - * libie_debugfs_data_read - read from 'data' file - * @filp: the opened file - * @buffer: where to write the data for the user to read - * @count: the size of the user's buffer - * @ppos: file position offset - */ -static ssize_t libie_debugfs_data_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - struct libie_fwlog *fwlog = filp->private_data; - int data_copied = 0; - bool done = false; - - if (libie_fwlog_ring_empty(&fwlog->ring)) - return 0; - - while (!libie_fwlog_ring_empty(&fwlog->ring) && !done) { - struct libie_fwlog_data *log; - u16 cur_buf_len; - - log = &fwlog->ring.rings[fwlog->ring.head]; - cur_buf_len = log->data_size; - if (cur_buf_len >= count) { - done = true; - continue; - } - - if (copy_to_user(buffer, log->data, cur_buf_len)) { - /* if there is an error then bail and return whatever - * the driver has copied so far - */ - done = true; - continue; - } - - data_copied += cur_buf_len; - buffer += cur_buf_len; - count -= cur_buf_len; - *ppos += cur_buf_len; - libie_fwlog_ring_increment(&fwlog->ring.head, fwlog->ring.size); - } - - return data_copied; -} - -/** - * libie_debugfs_data_write - write into 'data' file - * @filp: the opened file - * @buf: where to find the user's data - * @count: the length of the user's data - * @ppos: file position offset - */ -static ssize_t -libie_debugfs_data_write(struct file *filp, const char __user *buf, size_t count, - loff_t *ppos) -{ - struct libie_fwlog *fwlog = filp->private_data; - struct device *dev = &fwlog->pdev->dev; - ssize_t ret; - - /* don't allow partial writes */ - if (*ppos != 0) - return 0; - - /* any value is allowed to clear the buffer so no need to even look at - * what the value is - */ - if (!(fwlog->cfg.options & LIBIE_FWLOG_OPTION_IS_REGISTERED)) { - fwlog->ring.head = 0; - fwlog->ring.tail = 0; - } else { - dev_info(dev, "Can't clear FW log data while FW log running\n"); - ret = -EINVAL; - goto nr_buffs_write_error; - } - - /* if we get here, nothing went wrong; return count since we didn't - * really write anything - */ - ret = (ssize_t)count; - -nr_buffs_write_error: - /* This function always consumes all of the written input, or produces - * an error. Check and enforce this. Otherwise, the write operation - * won't complete properly. 
- */ - if (WARN_ON(ret != (ssize_t)count && ret >= 0)) - ret = -EIO; - - return ret; -} - -static const struct file_operations libie_debugfs_data_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .read = libie_debugfs_data_read, - .write = libie_debugfs_data_write, -}; - -/** - * libie_debugfs_fwlog_init - setup the debugfs directory - * @fwlog: pointer to the fwlog structure - * @root: debugfs root entry on which fwlog director will be registered - */ -static void libie_debugfs_fwlog_init(struct libie_fwlog *fwlog, - struct dentry *root) -{ - struct dentry *fw_modules_dir; - struct dentry **fw_modules; - int i; - - /* allocate space for this first because if it fails then we don't - * need to unwind - */ - fw_modules = kcalloc(LIBIE_NR_FW_LOG_MODULES, sizeof(*fw_modules), - GFP_KERNEL); - if (!fw_modules) - return; - - fwlog->debugfs = debugfs_create_dir("fwlog", root); - if (IS_ERR(fwlog->debugfs)) - goto err_create_module_files; - - fw_modules_dir = debugfs_create_dir("modules", fwlog->debugfs); - if (IS_ERR(fw_modules_dir)) - goto err_create_module_files; - - for (i = 0; i < LIBIE_NR_FW_LOG_MODULES; i++) { - fw_modules[i] = debugfs_create_file(libie_fwlog_module_string[i], - 0600, fw_modules_dir, fwlog, - &libie_debugfs_module_fops); - if (IS_ERR(fw_modules[i])) - goto err_create_module_files; - } - - debugfs_create_file("nr_messages", 0600, fwlog->debugfs, fwlog, - &libie_debugfs_nr_messages_fops); - - fwlog->debugfs_modules = fw_modules; - - debugfs_create_file("enable", 0600, fwlog->debugfs, fwlog, - &libie_debugfs_enable_fops); - - debugfs_create_file("log_size", 0600, fwlog->debugfs, fwlog, - &libie_debugfs_log_size_fops); - - debugfs_create_file("data", 0600, fwlog->debugfs, fwlog, - &libie_debugfs_data_fops); - - return; - -err_create_module_files: - debugfs_remove_recursive(fwlog->debugfs); - kfree(fw_modules); -} - -static bool libie_fwlog_ring_full(struct libie_fwlog_ring *rings) -{ - u16 head, tail; - - head = rings->head; - tail = rings->tail; - - if (head < tail && (tail - head == (rings->size - 1))) - return true; - else if (head > tail && (tail == (head - 1))) - return true; - - return false; -} - -/** - * libie_aq_fwlog_get - Get the current firmware logging configuration (0xFF32) - * @fwlog: pointer to the fwlog structure - * @cfg: firmware logging configuration to populate - */ -static int libie_aq_fwlog_get(struct libie_fwlog *fwlog, - struct libie_fwlog_cfg *cfg) -{ - struct libie_aqc_fw_log_cfg_resp *fw_modules; - struct libie_aq_desc desc = {0}; - struct libie_aqc_fw_log *cmd; - u16 module_id_cnt; - int status; - void *buf; - int i; - - memset(cfg, 0, sizeof(*cfg)); - - buf = kzalloc(LIBIE_AQ_MAX_BUF_LEN, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_query); - desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI); - cmd = libie_aq_raw(&desc); - - cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_QUERY; - - status = fwlog->send_cmd(fwlog->priv, &desc, buf, LIBIE_AQ_MAX_BUF_LEN); - if (status) { - dev_dbg(&fwlog->pdev->dev, "Failed to get FW log configuration\n"); - goto status_out; - } - - module_id_cnt = le16_to_cpu(cmd->ops.cfg.mdl_cnt); - if (module_id_cnt < LIBIE_AQC_FW_LOG_ID_MAX) { - dev_dbg(&fwlog->pdev->dev, "FW returned less than the expected number of FW log module IDs\n"); - } else if (module_id_cnt > LIBIE_AQC_FW_LOG_ID_MAX) { - dev_dbg(&fwlog->pdev->dev, "FW returned more than expected number of FW log module IDs, setting module_id_cnt to software expected max %u\n", - LIBIE_AQC_FW_LOG_ID_MAX); - module_id_cnt = 
LIBIE_AQC_FW_LOG_ID_MAX; - } - - cfg->log_resolution = le16_to_cpu(cmd->ops.cfg.log_resolution); - if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_CONF_AQ_EN) - cfg->options |= LIBIE_FWLOG_OPTION_ARQ_ENA; - if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_CONF_UART_EN) - cfg->options |= LIBIE_FWLOG_OPTION_UART_ENA; - if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_QUERY_REGISTERED) - cfg->options |= LIBIE_FWLOG_OPTION_IS_REGISTERED; - - fw_modules = (struct libie_aqc_fw_log_cfg_resp *)buf; - - for (i = 0; i < module_id_cnt; i++) { - struct libie_aqc_fw_log_cfg_resp *fw_module = &fw_modules[i]; - - cfg->module_entries[i].module_id = - le16_to_cpu(fw_module->module_identifier); - cfg->module_entries[i].log_level = fw_module->log_level; - } - -status_out: - kfree(buf); - return status; -} - -/** - * libie_fwlog_set_supported - Set if FW logging is supported by FW - * @fwlog: pointer to the fwlog structure - * - * If FW returns success to the libie_aq_fwlog_get call then it supports FW - * logging, else it doesn't. Set the fwlog_supported flag accordingly. - * - * This function is only meant to be called during driver init to determine if - * the FW support FW logging. - */ -static void libie_fwlog_set_supported(struct libie_fwlog *fwlog) -{ - struct libie_fwlog_cfg *cfg; - int status; - - fwlog->supported = false; - - cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); - if (!cfg) - return; - - status = libie_aq_fwlog_get(fwlog, cfg); - if (status) - dev_dbg(&fwlog->pdev->dev, "libie_aq_fwlog_get failed, FW logging is not supported on this version of FW, status %d\n", - status); - else - fwlog->supported = true; - - kfree(cfg); -} - -/** - * libie_fwlog_init - Initialize FW logging configuration - * @fwlog: pointer to the fwlog structure - * @api: api structure to init fwlog - * - * This function should be called on driver initialization during - * libie_init_hw(). - */ -int libie_fwlog_init(struct libie_fwlog *fwlog, struct libie_fwlog_api *api) -{ - fwlog->api = *api; - libie_fwlog_set_supported(fwlog); - - if (libie_fwlog_supported(fwlog)) { - int status; - - /* read the current config from the FW and store it */ - status = libie_aq_fwlog_get(fwlog, &fwlog->cfg); - if (status) - return status; - - fwlog->ring.rings = kcalloc(LIBIE_FWLOG_RING_SIZE_DFLT, - sizeof(*fwlog->ring.rings), - GFP_KERNEL); - if (!fwlog->ring.rings) { - dev_warn(&fwlog->pdev->dev, "Unable to allocate memory for FW log rings\n"); - return -ENOMEM; - } - - fwlog->ring.size = LIBIE_FWLOG_RING_SIZE_DFLT; - fwlog->ring.index = LIBIE_FWLOG_RING_SIZE_INDEX_DFLT; - - status = libie_fwlog_alloc_ring_buffs(&fwlog->ring); - if (status) { - dev_warn(&fwlog->pdev->dev, "Unable to allocate memory for FW log ring data buffers\n"); - libie_fwlog_free_ring_buffs(&fwlog->ring); - kfree(fwlog->ring.rings); - return status; - } - - libie_debugfs_fwlog_init(fwlog, api->debugfs_root); - } else { - dev_warn(&fwlog->pdev->dev, "FW logging is not supported in this NVM image. Please update the NVM to get FW log support\n"); - } - - return 0; -} - -/** - * libie_fwlog_deinit - unroll FW logging configuration - * @fwlog: pointer to the fwlog structure - * - * This function should be called in libie_deinit_hw(). 
- */ -void libie_fwlog_deinit(struct libie_fwlog *fwlog) -{ - int status; - - /* make sure FW logging is disabled to not put the FW in a weird state - * for the next driver load - */ - fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_ARQ_ENA; - status = libie_fwlog_set(fwlog, &fwlog->cfg); - if (status) - dev_warn(&fwlog->pdev->dev, "Unable to turn off FW logging, status: %d\n", - status); - - kfree(fwlog->debugfs_modules); - - fwlog->debugfs_modules = NULL; - - status = libie_fwlog_unregister(fwlog); - if (status) - dev_warn(&fwlog->pdev->dev, "Unable to unregister FW logging, status: %d\n", - status); - - if (fwlog->ring.rings) { - libie_fwlog_free_ring_buffs(&fwlog->ring); - kfree(fwlog->ring.rings); - } -} - -/** - * libie_get_fwlog_data - copy the FW log data from ARQ event - * @fwlog: fwlog that the FW log event is associated with - * @buf: event buffer pointer - * @len: len of event descriptor - */ -void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf, u16 len) -{ - struct libie_fwlog_data *log; - - log = &fwlog->ring.rings[fwlog->ring.tail]; - - memset(log->data, 0, PAGE_SIZE); - log->data_size = len; - - memcpy(log->data, buf, log->data_size); - libie_fwlog_ring_increment(&fwlog->ring.tail, fwlog->ring.size); - - if (libie_fwlog_ring_full(&fwlog->ring)) { - /* the rings are full so bump the head to create room */ - libie_fwlog_ring_increment(&fwlog->ring.head, fwlog->ring.size); - } -} - -void libie_fwlog_reregister(struct libie_fwlog *fwlog) -{ - if (!(fwlog->cfg.options & LIBIE_FWLOG_OPTION_IS_REGISTERED)) - return; - - if (libie_fwlog_register(fwlog)) - fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_IS_REGISTERED; -} diff --git a/drivers/net/ethernet/intel/ice/ice_fwlog.h b/drivers/net/ethernet/intel/ice/ice_fwlog.h deleted file mode 100644 index e534205a2d04..000000000000 --- a/drivers/net/ethernet/intel/ice/ice_fwlog.h +++ /dev/null @@ -1,84 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2022, Intel Corporation. */ - -#ifndef _LIBIE_FWLOG_H_ -#define _LIBIE_FWLOG_H_ -#include "ice_adminq_cmd.h" - -/* Only a single log level should be set and all log levels under the set value - * are enabled, e.g. 
if log level is set to LIBIE_FW_LOG_LEVEL_VERBOSE, then all - * other log levels are included (except LIBIE_FW_LOG_LEVEL_NONE) - */ -enum libie_fwlog_level { - LIBIE_FWLOG_LEVEL_NONE = 0, - LIBIE_FWLOG_LEVEL_ERROR = 1, - LIBIE_FWLOG_LEVEL_WARNING = 2, - LIBIE_FWLOG_LEVEL_NORMAL = 3, - LIBIE_FWLOG_LEVEL_VERBOSE = 4, - LIBIE_FWLOG_LEVEL_INVALID, /* all values >= this entry are invalid */ -}; - -struct libie_fwlog_module_entry { - /* module ID for the corresponding firmware logging event */ - u16 module_id; - /* verbosity level for the module_id */ - u8 log_level; -}; - -struct libie_fwlog_cfg { - /* list of modules for configuring log level */ - struct libie_fwlog_module_entry module_entries[LIBIE_AQC_FW_LOG_ID_MAX]; - /* options used to configure firmware logging */ - u16 options; -#define LIBIE_FWLOG_OPTION_ARQ_ENA BIT(0) -#define LIBIE_FWLOG_OPTION_UART_ENA BIT(1) - /* set before calling libie_fwlog_init() so the PF registers for - * firmware logging on initialization - */ -#define LIBIE_FWLOG_OPTION_REGISTER_ON_INIT BIT(2) - /* set in the libie_aq_fwlog_get() response if the PF is registered for - * FW logging events over ARQ - */ -#define LIBIE_FWLOG_OPTION_IS_REGISTERED BIT(3) - - /* minimum number of log events sent per Admin Receive Queue event */ - u16 log_resolution; -}; - -struct libie_fwlog_data { - u16 data_size; - u8 *data; -}; - -struct libie_fwlog_ring { - struct libie_fwlog_data *rings; - u16 index; - u16 size; - u16 head; - u16 tail; -}; - -#define LIBIE_FWLOG_RING_SIZE_INDEX_DFLT 3 -#define LIBIE_FWLOG_RING_SIZE_DFLT 256 -#define LIBIE_FWLOG_RING_SIZE_MAX 512 - -struct libie_fwlog { - struct libie_fwlog_cfg cfg; - bool supported; /* does hardware support FW logging? */ - struct libie_fwlog_ring ring; - struct dentry *debugfs; - /* keep track of all the dentrys for FW log modules */ - struct dentry **debugfs_modules; - struct_group_tagged(libie_fwlog_api, api, - struct pci_dev *pdev; - int (*send_cmd)(void *, struct libie_aq_desc *, void *, u16); - void *priv; - struct dentry *debugfs_root; - ); -}; - -int libie_fwlog_init(struct libie_fwlog *fwlog, struct libie_fwlog_api *api); -void libie_fwlog_deinit(struct libie_fwlog *fwlog); -void libie_fwlog_reregister(struct libie_fwlog *fwlog); -void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf, u16 len); -#endif /* _LIBIE_FWLOG_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 44e184c6c416..f0d3aee24a93 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -39,6 +39,7 @@ static const char ice_copyright[] = "Copyright (c) 2018, Intel Corporation."; MODULE_DESCRIPTION(DRV_SUMMARY); MODULE_IMPORT_NS("LIBIE"); MODULE_IMPORT_NS("LIBIE_ADMINQ"); +MODULE_IMPORT_NS("LIBIE_FWLOG"); MODULE_LICENSE("GPL v2"); MODULE_FIRMWARE(ICE_DDP_PKG_FILE); diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h index 14296bccc4c9..b0a1b67071c5 100644 --- a/drivers/net/ethernet/intel/ice/ice_type.h +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -17,7 +17,7 @@ #include "ice_protocol_type.h" #include "ice_sbq_cmd.h" #include "ice_vlan_mode.h" -#include "ice_fwlog.h" +#include #include #include diff --git a/drivers/net/ethernet/intel/libie/Kconfig b/drivers/net/ethernet/intel/libie/Kconfig index e6072758e3d8..70831c7e336e 100644 --- a/drivers/net/ethernet/intel/libie/Kconfig +++ b/drivers/net/ethernet/intel/libie/Kconfig @@ -14,3 +14,12 @@ config LIBIE_ADMINQ help Helper functions used by 
Intel Ethernet drivers for administration queue command interface (aka adminq). + +config LIBIE_FWLOG + tristate + select LIBIE_ADMINQ + help + Library to support firmware logging on devices that support it. + Firmware logging uses the admin queue interface to communicate + with the device. Debugfs is the user interface used to configure + logging and dump all collected logs. diff --git a/drivers/net/ethernet/intel/libie/Makefile b/drivers/net/ethernet/intel/libie/Makefile index e98f00b865d3..db57fc6780ea 100644 --- a/drivers/net/ethernet/intel/libie/Makefile +++ b/drivers/net/ethernet/intel/libie/Makefile @@ -8,3 +8,7 @@ libie-y := rx.o obj-$(CONFIG_LIBIE_ADMINQ) += libie_adminq.o libie_adminq-y := adminq.o + +obj-$(CONFIG_LIBIE_FWLOG) += libie_fwlog.o + +libie_fwlog-y := fwlog.o diff --git a/drivers/net/ethernet/intel/libie/fwlog.c b/drivers/net/ethernet/intel/libie/fwlog.c new file mode 100644 index 000000000000..f39cc11cb7c5 --- /dev/null +++ b/drivers/net/ethernet/intel/libie/fwlog.c @@ -0,0 +1,1115 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022, Intel Corporation. */ + +#include +#include +#include +#include +#include +#include +#include + +#define DEFAULT_SYMBOL_NAMESPACE "LIBIE_FWLOG" + +/* create a define that has an extra module that doesn't really exist. this + * is so we can add a module 'all' to easily enable/disable all the modules + */ +#define LIBIE_NR_FW_LOG_MODULES (LIBIE_AQC_FW_LOG_ID_MAX + 1) + +/* the ordering in this array is important. it matches the ordering of the + * values in the FW so the index is the same value as in + * libie_aqc_fw_logging_mod + */ +static const char * const libie_fwlog_module_string[] = { + "general", + "ctrl", + "link", + "link_topo", + "dnl", + "i2c", + "sdp", + "mdio", + "adminq", + "hdma", + "lldp", + "dcbx", + "dcb", + "xlr", + "nvm", + "auth", + "vpd", + "iosf", + "parser", + "sw", + "scheduler", + "txq", + "rsvd", + "post", + "watchdog", + "task_dispatch", + "mng", + "synce", + "health", + "tsdrv", + "pfreg", + "mdlver", + "all", +}; + +/* the ordering in this array is important. 
it matches the ordering of the + * values in the FW so the index is the same value as in libie_fwlog_level + */ +static const char * const libie_fwlog_level_string[] = { + "none", + "error", + "warning", + "normal", + "verbose", +}; + +static const char * const libie_fwlog_log_size[] = { + "128K", + "256K", + "512K", + "1M", + "2M", +}; + +static bool libie_fwlog_ring_empty(struct libie_fwlog_ring *rings) +{ + return rings->head == rings->tail; +} + +static void libie_fwlog_ring_increment(u16 *item, u16 size) +{ + *item = (*item + 1) & (size - 1); +} + +static int libie_fwlog_alloc_ring_buffs(struct libie_fwlog_ring *rings) +{ + int i, nr_bytes; + u8 *mem; + + nr_bytes = rings->size * LIBIE_AQ_MAX_BUF_LEN; + mem = vzalloc(nr_bytes); + if (!mem) + return -ENOMEM; + + for (i = 0; i < rings->size; i++) { + struct libie_fwlog_data *ring = &rings->rings[i]; + + ring->data_size = LIBIE_AQ_MAX_BUF_LEN; + ring->data = mem; + mem += LIBIE_AQ_MAX_BUF_LEN; + } + + return 0; +} + +static void libie_fwlog_free_ring_buffs(struct libie_fwlog_ring *rings) +{ + int i; + + for (i = 0; i < rings->size; i++) { + struct libie_fwlog_data *ring = &rings->rings[i]; + + /* the first ring is the base memory for the whole range so + * free it + */ + if (!i) + vfree(ring->data); + + ring->data = NULL; + ring->data_size = 0; + } +} + +#define LIBIE_FWLOG_INDEX_TO_BYTES(n) ((128 * 1024) << (n)) +/** + * libie_fwlog_realloc_rings - reallocate the FW log rings + * @fwlog: pointer to the fwlog structure + * @index: the new index to use to allocate memory for the log data + * + */ +static void libie_fwlog_realloc_rings(struct libie_fwlog *fwlog, int index) +{ + struct libie_fwlog_ring ring; + int status, ring_size; + + /* convert the number of bytes into a number of 4K buffers. externally + * the driver presents the interface to the FW log data as a number of + * bytes because that's easy for users to understand. internally the + * driver uses a ring of buffers because the driver doesn't know where + * the beginning and end of any line of log data is so the driver has + * to overwrite data as complete blocks. when the data is returned to + * the user the driver knows that the data is correct and the FW log + * can be correctly parsed by the tools + */ + ring_size = LIBIE_FWLOG_INDEX_TO_BYTES(index) / LIBIE_AQ_MAX_BUF_LEN; + if (ring_size == fwlog->ring.size) + return; + + /* allocate space for the new rings and buffers then release the + * old rings and buffers. that way if we don't have enough + * memory then we at least have what we had before + */ + ring.rings = kcalloc(ring_size, sizeof(*ring.rings), GFP_KERNEL); + if (!ring.rings) + return; + + ring.size = ring_size; + + status = libie_fwlog_alloc_ring_buffs(&ring); + if (status) { + dev_warn(&fwlog->pdev->dev, "Unable to allocate memory for FW log ring data buffers\n"); + libie_fwlog_free_ring_buffs(&ring); + kfree(ring.rings); + return; + } + + libie_fwlog_free_ring_buffs(&fwlog->ring); + kfree(fwlog->ring.rings); + + fwlog->ring.rings = ring.rings; + fwlog->ring.size = ring.size; + fwlog->ring.index = index; + fwlog->ring.head = 0; + fwlog->ring.tail = 0; +} + +/** + * libie_fwlog_supported - Cached for whether FW supports FW logging or not + * @fwlog: pointer to the fwlog structure + * + * This will always return false if called before libie_init_hw(), so it must be + * called after libie_init_hw(). 
+ */ +static bool libie_fwlog_supported(struct libie_fwlog *fwlog) +{ + return fwlog->supported; +} + +/** + * libie_aq_fwlog_set - Set FW logging configuration AQ command (0xFF30) + * @fwlog: pointer to the fwlog structure + * @entries: entries to configure + * @num_entries: number of @entries + * @options: options from libie_fwlog_cfg->options structure + * @log_resolution: logging resolution + */ +static int +libie_aq_fwlog_set(struct libie_fwlog *fwlog, + struct libie_fwlog_module_entry *entries, u16 num_entries, + u16 options, u16 log_resolution) +{ + struct libie_aqc_fw_log_cfg_resp *fw_modules; + struct libie_aq_desc desc = {0}; + struct libie_aqc_fw_log *cmd; + int status; + int i; + + fw_modules = kcalloc(num_entries, sizeof(*fw_modules), GFP_KERNEL); + if (!fw_modules) + return -ENOMEM; + + for (i = 0; i < num_entries; i++) { + fw_modules[i].module_identifier = + cpu_to_le16(entries[i].module_id); + fw_modules[i].log_level = entries[i].log_level; + } + + desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_config); + desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI) | + cpu_to_le16(LIBIE_AQ_FLAG_RD); + + cmd = libie_aq_raw(&desc); + + cmd->cmd_flags = LIBIE_AQC_FW_LOG_CONF_SET_VALID; + cmd->ops.cfg.log_resolution = cpu_to_le16(log_resolution); + cmd->ops.cfg.mdl_cnt = cpu_to_le16(num_entries); + + if (options & LIBIE_FWLOG_OPTION_ARQ_ENA) + cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_AQ_EN; + if (options & LIBIE_FWLOG_OPTION_UART_ENA) + cmd->cmd_flags |= LIBIE_AQC_FW_LOG_CONF_UART_EN; + + status = fwlog->send_cmd(fwlog->priv, &desc, fw_modules, + sizeof(*fw_modules) * num_entries); + + kfree(fw_modules); + + return status; +} + +/** + * libie_fwlog_set - Set the firmware logging settings + * @fwlog: pointer to the fwlog structure + * @cfg: config used to set firmware logging + * + * This function should be called whenever the driver needs to set the firmware + * logging configuration. It can be called on initialization, reset, or during + * runtime. + * + * If the PF wishes to receive FW logging then it must register via + * libie_fwlog_register. Note that libie_fwlog_register does not need to be called + * for init. + */ +static int libie_fwlog_set(struct libie_fwlog *fwlog, + struct libie_fwlog_cfg *cfg) +{ + if (!libie_fwlog_supported(fwlog)) + return -EOPNOTSUPP; + + return libie_aq_fwlog_set(fwlog, cfg->module_entries, + LIBIE_AQC_FW_LOG_ID_MAX, cfg->options, + cfg->log_resolution); +} + +/** + * libie_aq_fwlog_register - Register PF for firmware logging events (0xFF31) + * @fwlog: pointer to the fwlog structure + * @reg: true to register and false to unregister + */ +static int libie_aq_fwlog_register(struct libie_fwlog *fwlog, bool reg) +{ + struct libie_aq_desc desc = {0}; + struct libie_aqc_fw_log *cmd; + + desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_register); + desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI); + cmd = libie_aq_raw(&desc); + + if (reg) + cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_REGISTER; + + return fwlog->send_cmd(fwlog->priv, &desc, NULL, 0); +} + +/** + * libie_fwlog_register - Register the PF for firmware logging + * @fwlog: pointer to the fwlog structure + * + * After this call the PF will start to receive firmware logging based on the + * configuration set in libie_fwlog_set. 
+ */ +static int libie_fwlog_register(struct libie_fwlog *fwlog) +{ + int status; + + if (!libie_fwlog_supported(fwlog)) + return -EOPNOTSUPP; + + status = libie_aq_fwlog_register(fwlog, true); + if (status) + dev_dbg(&fwlog->pdev->dev, "Failed to register for firmware logging events over ARQ\n"); + else + fwlog->cfg.options |= LIBIE_FWLOG_OPTION_IS_REGISTERED; + + return status; +} + +/** + * libie_fwlog_unregister - Unregister the PF from firmware logging + * @fwlog: pointer to the fwlog structure + */ +static int libie_fwlog_unregister(struct libie_fwlog *fwlog) +{ + int status; + + if (!libie_fwlog_supported(fwlog)) + return -EOPNOTSUPP; + + status = libie_aq_fwlog_register(fwlog, false); + if (status) + dev_dbg(&fwlog->pdev->dev, "Failed to unregister from firmware logging events over ARQ\n"); + else + fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_IS_REGISTERED; + + return status; +} + +/** + * libie_fwlog_print_module_cfg - print current FW logging module configuration + * @cfg: pointer to the fwlog cfg structure + * @module: module to print + * @s: the seq file to put data into + */ +static void +libie_fwlog_print_module_cfg(struct libie_fwlog_cfg *cfg, int module, + struct seq_file *s) +{ + struct libie_fwlog_module_entry *entry; + + if (module != LIBIE_AQC_FW_LOG_ID_MAX) { + entry = &cfg->module_entries[module]; + + seq_printf(s, "\tModule: %s, Log Level: %s\n", + libie_fwlog_module_string[entry->module_id], + libie_fwlog_level_string[entry->log_level]); + } else { + int i; + + for (i = 0; i < LIBIE_AQC_FW_LOG_ID_MAX; i++) { + entry = &cfg->module_entries[i]; + + seq_printf(s, "\tModule: %s, Log Level: %s\n", + libie_fwlog_module_string[entry->module_id], + libie_fwlog_level_string[entry->log_level]); + } + } +} + +static int libie_find_module_by_dentry(struct dentry **modules, struct dentry *d) +{ + int i, module; + + module = -1; + /* find the module based on the dentry */ + for (i = 0; i < LIBIE_NR_FW_LOG_MODULES; i++) { + if (d == modules[i]) { + module = i; + break; + } + } + + return module; +} + +/** + * libie_debugfs_module_show - read from 'module' file + * @s: the opened file + * @v: pointer to the offset + */ +static int libie_debugfs_module_show(struct seq_file *s, void *v) +{ + struct libie_fwlog *fwlog = s->private; + const struct file *filp = s->file; + struct dentry *dentry; + int module; + + dentry = file_dentry(filp); + + module = libie_find_module_by_dentry(fwlog->debugfs_modules, dentry); + if (module < 0) { + dev_info(&fwlog->pdev->dev, "unknown module\n"); + return -EINVAL; + } + + libie_fwlog_print_module_cfg(&fwlog->cfg, module, s); + + return 0; +} + +static int libie_debugfs_module_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, libie_debugfs_module_show, inode->i_private); +} + +/** + * libie_debugfs_module_write - write into 'module' file + * @filp: the opened file + * @buf: where to find the user's data + * @count: the length of the user's data + * @ppos: file position offset + */ +static ssize_t +libie_debugfs_module_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct libie_fwlog *fwlog = file_inode(filp)->i_private; + struct dentry *dentry = file_dentry(filp); + struct device *dev = &fwlog->pdev->dev; + char user_val[16], *cmd_buf; + int module, log_level, cnt; + + /* don't allow partial writes or invalid input */ + if (*ppos != 0 || count > 8) + return -EINVAL; + + cmd_buf = memdup_user_nul(buf, count); + if (IS_ERR(cmd_buf)) + return PTR_ERR(cmd_buf); + + module = 
libie_find_module_by_dentry(fwlog->debugfs_modules, dentry); + if (module < 0) { + dev_info(dev, "unknown module\n"); + return -EINVAL; + } + + cnt = sscanf(cmd_buf, "%s", user_val); + if (cnt != 1) + return -EINVAL; + + log_level = sysfs_match_string(libie_fwlog_level_string, user_val); + if (log_level < 0) { + dev_info(dev, "unknown log level '%s'\n", user_val); + return -EINVAL; + } + + if (module != LIBIE_AQC_FW_LOG_ID_MAX) { + fwlog->cfg.module_entries[module].log_level = log_level; + } else { + /* the module 'all' is a shortcut so that we can set + * all of the modules to the same level quickly + */ + int i; + + for (i = 0; i < LIBIE_AQC_FW_LOG_ID_MAX; i++) + fwlog->cfg.module_entries[i].log_level = log_level; + } + + return count; +} + +static const struct file_operations libie_debugfs_module_fops = { + .owner = THIS_MODULE, + .open = libie_debugfs_module_open, + .read = seq_read, + .release = single_release, + .write = libie_debugfs_module_write, +}; + +/** + * libie_debugfs_nr_messages_read - read from 'nr_messages' file + * @filp: the opened file + * @buffer: where to write the data for the user to read + * @count: the size of the user's buffer + * @ppos: file position offset + */ +static ssize_t libie_debugfs_nr_messages_read(struct file *filp, + char __user *buffer, size_t count, + loff_t *ppos) +{ + struct libie_fwlog *fwlog = filp->private_data; + char buff[32] = {}; + + snprintf(buff, sizeof(buff), "%d\n", + fwlog->cfg.log_resolution); + + return simple_read_from_buffer(buffer, count, ppos, buff, strlen(buff)); +} + +/** + * libie_debugfs_nr_messages_write - write into 'nr_messages' file + * @filp: the opened file + * @buf: where to find the user's data + * @count: the length of the user's data + * @ppos: file position offset + */ +static ssize_t +libie_debugfs_nr_messages_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct libie_fwlog *fwlog = filp->private_data; + struct device *dev = &fwlog->pdev->dev; + char user_val[8], *cmd_buf; + s16 nr_messages; + ssize_t ret; + + /* don't allow partial writes or invalid input */ + if (*ppos != 0 || count > 4) + return -EINVAL; + + cmd_buf = memdup_user_nul(buf, count); + if (IS_ERR(cmd_buf)) + return PTR_ERR(cmd_buf); + + ret = sscanf(cmd_buf, "%s", user_val); + if (ret != 1) + return -EINVAL; + + ret = kstrtos16(user_val, 0, &nr_messages); + if (ret) + return ret; + + if (nr_messages < LIBIE_AQC_FW_LOG_MIN_RESOLUTION || + nr_messages > LIBIE_AQC_FW_LOG_MAX_RESOLUTION) { + dev_err(dev, "Invalid FW log number of messages %d, value must be between %d - %d\n", + nr_messages, LIBIE_AQC_FW_LOG_MIN_RESOLUTION, + LIBIE_AQC_FW_LOG_MAX_RESOLUTION); + return -EINVAL; + } + + fwlog->cfg.log_resolution = nr_messages; + + return count; +} + +static const struct file_operations libie_debugfs_nr_messages_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = libie_debugfs_nr_messages_read, + .write = libie_debugfs_nr_messages_write, +}; + +/** + * libie_debugfs_enable_read - read from 'enable' file + * @filp: the opened file + * @buffer: where to write the data for the user to read + * @count: the size of the user's buffer + * @ppos: file position offset + */ +static ssize_t libie_debugfs_enable_read(struct file *filp, + char __user *buffer, size_t count, + loff_t *ppos) +{ + struct libie_fwlog *fwlog = filp->private_data; + char buff[32] = {}; + + snprintf(buff, sizeof(buff), "%u\n", + (u16)(fwlog->cfg.options & + LIBIE_FWLOG_OPTION_IS_REGISTERED) >> 3); + + return simple_read_from_buffer(buffer, 
count, ppos, buff, strlen(buff)); +} + +/** + * libie_debugfs_enable_write - write into 'enable' file + * @filp: the opened file + * @buf: where to find the user's data + * @count: the length of the user's data + * @ppos: file position offset + */ +static ssize_t +libie_debugfs_enable_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct libie_fwlog *fwlog = filp->private_data; + char user_val[8], *cmd_buf; + bool enable; + ssize_t ret; + + /* don't allow partial writes or invalid input */ + if (*ppos != 0 || count > 2) + return -EINVAL; + + cmd_buf = memdup_user_nul(buf, count); + if (IS_ERR(cmd_buf)) + return PTR_ERR(cmd_buf); + + ret = sscanf(cmd_buf, "%s", user_val); + if (ret != 1) + return -EINVAL; + + ret = kstrtobool(user_val, &enable); + if (ret) + goto enable_write_error; + + if (enable) + fwlog->cfg.options |= LIBIE_FWLOG_OPTION_ARQ_ENA; + else + fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_ARQ_ENA; + + ret = libie_fwlog_set(fwlog, &fwlog->cfg); + if (ret) + goto enable_write_error; + + if (enable) + ret = libie_fwlog_register(fwlog); + else + ret = libie_fwlog_unregister(fwlog); + + if (ret) + goto enable_write_error; + + /* if we get here, nothing went wrong; return count since we didn't + * really write anything + */ + ret = (ssize_t)count; + +enable_write_error: + /* This function always consumes all of the written input, or produces + * an error. Check and enforce this. Otherwise, the write operation + * won't complete properly. + */ + if (WARN_ON(ret != (ssize_t)count && ret >= 0)) + ret = -EIO; + + return ret; +} + +static const struct file_operations libie_debugfs_enable_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = libie_debugfs_enable_read, + .write = libie_debugfs_enable_write, +}; + +/** + * libie_debugfs_log_size_read - read from 'log_size' file + * @filp: the opened file + * @buffer: where to write the data for the user to read + * @count: the size of the user's buffer + * @ppos: file position offset + */ +static ssize_t libie_debugfs_log_size_read(struct file *filp, + char __user *buffer, size_t count, + loff_t *ppos) +{ + struct libie_fwlog *fwlog = filp->private_data; + char buff[32] = {}; + int index; + + index = fwlog->ring.index; + snprintf(buff, sizeof(buff), "%s\n", libie_fwlog_log_size[index]); + + return simple_read_from_buffer(buffer, count, ppos, buff, strlen(buff)); +} + +/** + * libie_debugfs_log_size_write - write into 'log_size' file + * @filp: the opened file + * @buf: where to find the user's data + * @count: the length of the user's data + * @ppos: file position offset + */ +static ssize_t +libie_debugfs_log_size_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct libie_fwlog *fwlog = filp->private_data; + struct device *dev = &fwlog->pdev->dev; + char user_val[8], *cmd_buf; + ssize_t ret; + int index; + + /* don't allow partial writes or invalid input */ + if (*ppos != 0 || count > 5) + return -EINVAL; + + cmd_buf = memdup_user_nul(buf, count); + if (IS_ERR(cmd_buf)) + return PTR_ERR(cmd_buf); + + ret = sscanf(cmd_buf, "%s", user_val); + if (ret != 1) + return -EINVAL; + + index = sysfs_match_string(libie_fwlog_log_size, user_val); + if (index < 0) { + dev_info(dev, "Invalid log size '%s'. The value must be one of 128K, 256K, 512K, 1M, 2M\n", + user_val); + ret = -EINVAL; + goto log_size_write_error; + } else if (fwlog->cfg.options & LIBIE_FWLOG_OPTION_IS_REGISTERED) { + dev_info(dev, "FW logging is currently running. 
Please disable FW logging to change log_size\n"); + ret = -EINVAL; + goto log_size_write_error; + } + + /* free all the buffers and the tracking info and resize */ + libie_fwlog_realloc_rings(fwlog, index); + + /* if we get here, nothing went wrong; return count since we didn't + * really write anything + */ + ret = (ssize_t)count; + +log_size_write_error: + /* This function always consumes all of the written input, or produces + * an error. Check and enforce this. Otherwise, the write operation + * won't complete properly. + */ + if (WARN_ON(ret != (ssize_t)count && ret >= 0)) + ret = -EIO; + + return ret; +} + +static const struct file_operations libie_debugfs_log_size_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = libie_debugfs_log_size_read, + .write = libie_debugfs_log_size_write, +}; + +/** + * libie_debugfs_data_read - read from 'data' file + * @filp: the opened file + * @buffer: where to write the data for the user to read + * @count: the size of the user's buffer + * @ppos: file position offset + */ +static ssize_t libie_debugfs_data_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct libie_fwlog *fwlog = filp->private_data; + int data_copied = 0; + bool done = false; + + if (libie_fwlog_ring_empty(&fwlog->ring)) + return 0; + + while (!libie_fwlog_ring_empty(&fwlog->ring) && !done) { + struct libie_fwlog_data *log; + u16 cur_buf_len; + + log = &fwlog->ring.rings[fwlog->ring.head]; + cur_buf_len = log->data_size; + if (cur_buf_len >= count) { + done = true; + continue; + } + + if (copy_to_user(buffer, log->data, cur_buf_len)) { + /* if there is an error then bail and return whatever + * the driver has copied so far + */ + done = true; + continue; + } + + data_copied += cur_buf_len; + buffer += cur_buf_len; + count -= cur_buf_len; + *ppos += cur_buf_len; + libie_fwlog_ring_increment(&fwlog->ring.head, fwlog->ring.size); + } + + return data_copied; +} + +/** + * libie_debugfs_data_write - write into 'data' file + * @filp: the opened file + * @buf: where to find the user's data + * @count: the length of the user's data + * @ppos: file position offset + */ +static ssize_t +libie_debugfs_data_write(struct file *filp, const char __user *buf, size_t count, + loff_t *ppos) +{ + struct libie_fwlog *fwlog = filp->private_data; + struct device *dev = &fwlog->pdev->dev; + ssize_t ret; + + /* don't allow partial writes */ + if (*ppos != 0) + return 0; + + /* any value is allowed to clear the buffer so no need to even look at + * what the value is + */ + if (!(fwlog->cfg.options & LIBIE_FWLOG_OPTION_IS_REGISTERED)) { + fwlog->ring.head = 0; + fwlog->ring.tail = 0; + } else { + dev_info(dev, "Can't clear FW log data while FW log running\n"); + ret = -EINVAL; + goto nr_buffs_write_error; + } + + /* if we get here, nothing went wrong; return count since we didn't + * really write anything + */ + ret = (ssize_t)count; + +nr_buffs_write_error: + /* This function always consumes all of the written input, or produces + * an error. Check and enforce this. Otherwise, the write operation + * won't complete properly. 
+ */ + if (WARN_ON(ret != (ssize_t)count && ret >= 0)) + ret = -EIO; + + return ret; +} + +static const struct file_operations libie_debugfs_data_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = libie_debugfs_data_read, + .write = libie_debugfs_data_write, +}; + +/** + * libie_debugfs_fwlog_init - setup the debugfs directory + * @fwlog: pointer to the fwlog structure + * @root: debugfs root entry on which fwlog directory will be registered + */ +static void libie_debugfs_fwlog_init(struct libie_fwlog *fwlog, + struct dentry *root) +{ + struct dentry *fw_modules_dir; + struct dentry **fw_modules; + int i; + + /* allocate space for this first because if it fails then we don't + * need to unwind + */ + fw_modules = kcalloc(LIBIE_NR_FW_LOG_MODULES, sizeof(*fw_modules), + GFP_KERNEL); + if (!fw_modules) + return; + + fwlog->debugfs = debugfs_create_dir("fwlog", root); + if (IS_ERR(fwlog->debugfs)) + goto err_create_module_files; + + fw_modules_dir = debugfs_create_dir("modules", fwlog->debugfs); + if (IS_ERR(fw_modules_dir)) + goto err_create_module_files; + + for (i = 0; i < LIBIE_NR_FW_LOG_MODULES; i++) { + fw_modules[i] = debugfs_create_file(libie_fwlog_module_string[i], + 0600, fw_modules_dir, fwlog, + &libie_debugfs_module_fops); + if (IS_ERR(fw_modules[i])) + goto err_create_module_files; + } + + debugfs_create_file("nr_messages", 0600, fwlog->debugfs, fwlog, + &libie_debugfs_nr_messages_fops); + + fwlog->debugfs_modules = fw_modules; + + debugfs_create_file("enable", 0600, fwlog->debugfs, fwlog, + &libie_debugfs_enable_fops); + + debugfs_create_file("log_size", 0600, fwlog->debugfs, fwlog, + &libie_debugfs_log_size_fops); + + debugfs_create_file("data", 0600, fwlog->debugfs, fwlog, + &libie_debugfs_data_fops); + + return; + +err_create_module_files: + debugfs_remove_recursive(fwlog->debugfs); + kfree(fw_modules); +} + +static bool libie_fwlog_ring_full(struct libie_fwlog_ring *rings) +{ + u16 head, tail; + + head = rings->head; + tail = rings->tail; + + if (head < tail && (tail - head == (rings->size - 1))) + return true; + else if (head > tail && (tail == (head - 1))) + return true; + + return false; +} + +/** + * libie_aq_fwlog_get - Get the current firmware logging configuration (0xFF32) + * @fwlog: pointer to the fwlog structure + * @cfg: firmware logging configuration to populate + */ +static int libie_aq_fwlog_get(struct libie_fwlog *fwlog, + struct libie_fwlog_cfg *cfg) +{ + struct libie_aqc_fw_log_cfg_resp *fw_modules; + struct libie_aq_desc desc = {0}; + struct libie_aqc_fw_log *cmd; + u16 module_id_cnt; + int status; + void *buf; + int i; + + memset(cfg, 0, sizeof(*cfg)); + + buf = kzalloc(LIBIE_AQ_MAX_BUF_LEN, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + desc.opcode = cpu_to_le16(libie_aqc_opc_fw_logs_query); + desc.flags = cpu_to_le16(LIBIE_AQ_FLAG_SI); + cmd = libie_aq_raw(&desc); + + cmd->cmd_flags = LIBIE_AQC_FW_LOG_AQ_QUERY; + + status = fwlog->send_cmd(fwlog->priv, &desc, buf, LIBIE_AQ_MAX_BUF_LEN); + if (status) { + dev_dbg(&fwlog->pdev->dev, "Failed to get FW log configuration\n"); + goto status_out; + } + + module_id_cnt = le16_to_cpu(cmd->ops.cfg.mdl_cnt); + if (module_id_cnt < LIBIE_AQC_FW_LOG_ID_MAX) { + dev_dbg(&fwlog->pdev->dev, "FW returned less than the expected number of FW log module IDs\n"); + } else if (module_id_cnt > LIBIE_AQC_FW_LOG_ID_MAX) { + dev_dbg(&fwlog->pdev->dev, "FW returned more than the expected number of FW log module IDs, setting module_id_cnt to software expected max %u\n", + LIBIE_AQC_FW_LOG_ID_MAX); + module_id_cnt = 
LIBIE_AQC_FW_LOG_ID_MAX; + } + + cfg->log_resolution = le16_to_cpu(cmd->ops.cfg.log_resolution); + if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_CONF_AQ_EN) + cfg->options |= LIBIE_FWLOG_OPTION_ARQ_ENA; + if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_CONF_UART_EN) + cfg->options |= LIBIE_FWLOG_OPTION_UART_ENA; + if (cmd->cmd_flags & LIBIE_AQC_FW_LOG_QUERY_REGISTERED) + cfg->options |= LIBIE_FWLOG_OPTION_IS_REGISTERED; + + fw_modules = (struct libie_aqc_fw_log_cfg_resp *)buf; + + for (i = 0; i < module_id_cnt; i++) { + struct libie_aqc_fw_log_cfg_resp *fw_module = &fw_modules[i]; + + cfg->module_entries[i].module_id = + le16_to_cpu(fw_module->module_identifier); + cfg->module_entries[i].log_level = fw_module->log_level; + } + +status_out: + kfree(buf); + return status; +} + +/** + * libie_fwlog_set_supported - Set if FW logging is supported by FW + * @fwlog: pointer to the fwlog structure + * + * If FW returns success to the libie_aq_fwlog_get call then it supports FW + * logging, else it doesn't. Set the fwlog_supported flag accordingly. + * + * This function is only meant to be called during driver init to determine if + * the FW supports FW logging. + */ +static void libie_fwlog_set_supported(struct libie_fwlog *fwlog) +{ + struct libie_fwlog_cfg *cfg; + int status; + + fwlog->supported = false; + + cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + if (!cfg) + return; + + status = libie_aq_fwlog_get(fwlog, cfg); + if (status) + dev_dbg(&fwlog->pdev->dev, "libie_aq_fwlog_get failed, FW logging is not supported on this version of FW, status %d\n", + status); + else + fwlog->supported = true; + + kfree(cfg); +} + +/** + * libie_fwlog_init - Initialize FW logging configuration + * @fwlog: pointer to the fwlog structure + * @api: api structure to init fwlog + * + * This function should be called on driver initialization during + * libie_init_hw(). + */ +int libie_fwlog_init(struct libie_fwlog *fwlog, struct libie_fwlog_api *api) +{ + fwlog->api = *api; + libie_fwlog_set_supported(fwlog); + + if (libie_fwlog_supported(fwlog)) { + int status; + + /* read the current config from the FW and store it */ + status = libie_aq_fwlog_get(fwlog, &fwlog->cfg); + if (status) + return status; + + fwlog->ring.rings = kcalloc(LIBIE_FWLOG_RING_SIZE_DFLT, + sizeof(*fwlog->ring.rings), + GFP_KERNEL); + if (!fwlog->ring.rings) { + dev_warn(&fwlog->pdev->dev, "Unable to allocate memory for FW log rings\n"); + return -ENOMEM; + } + + fwlog->ring.size = LIBIE_FWLOG_RING_SIZE_DFLT; + fwlog->ring.index = LIBIE_FWLOG_RING_SIZE_INDEX_DFLT; + + status = libie_fwlog_alloc_ring_buffs(&fwlog->ring); + if (status) { + dev_warn(&fwlog->pdev->dev, "Unable to allocate memory for FW log ring data buffers\n"); + libie_fwlog_free_ring_buffs(&fwlog->ring); + kfree(fwlog->ring.rings); + return status; + } + + libie_debugfs_fwlog_init(fwlog, api->debugfs_root); + } else { + dev_warn(&fwlog->pdev->dev, "FW logging is not supported in this NVM image. Please update the NVM to get FW log support\n"); + } + + return 0; +} +EXPORT_SYMBOL_GPL(libie_fwlog_init); + +/** + * libie_fwlog_deinit - unroll FW logging configuration + * @fwlog: pointer to the fwlog structure + * + * This function should be called in libie_deinit_hw(). 
+ */ +void libie_fwlog_deinit(struct libie_fwlog *fwlog) +{ + int status; + + /* make sure FW logging is disabled to not put the FW in a weird state + * for the next driver load + */ + fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_ARQ_ENA; + status = libie_fwlog_set(fwlog, &fwlog->cfg); + if (status) + dev_warn(&fwlog->pdev->dev, "Unable to turn off FW logging, status: %d\n", + status); + + kfree(fwlog->debugfs_modules); + + fwlog->debugfs_modules = NULL; + + status = libie_fwlog_unregister(fwlog); + if (status) + dev_warn(&fwlog->pdev->dev, "Unable to unregister FW logging, status: %d\n", + status); + + if (fwlog->ring.rings) { + libie_fwlog_free_ring_buffs(&fwlog->ring); + kfree(fwlog->ring.rings); + } +} +EXPORT_SYMBOL_GPL(libie_fwlog_deinit); + +/** + * libie_get_fwlog_data - copy the FW log data from ARQ event + * @fwlog: fwlog that the FW log event is associated with + * @buf: event buffer pointer + * @len: len of event descriptor + */ +void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf, u16 len) +{ + struct libie_fwlog_data *log; + + log = &fwlog->ring.rings[fwlog->ring.tail]; + + memset(log->data, 0, PAGE_SIZE); + log->data_size = len; + + memcpy(log->data, buf, log->data_size); + libie_fwlog_ring_increment(&fwlog->ring.tail, fwlog->ring.size); + + if (libie_fwlog_ring_full(&fwlog->ring)) { + /* the rings are full so bump the head to create room */ + libie_fwlog_ring_increment(&fwlog->ring.head, fwlog->ring.size); + } +} +EXPORT_SYMBOL_GPL(libie_get_fwlog_data); + +void libie_fwlog_reregister(struct libie_fwlog *fwlog) +{ + if (!(fwlog->cfg.options & LIBIE_FWLOG_OPTION_IS_REGISTERED)) + return; + + if (libie_fwlog_register(fwlog)) + fwlog->cfg.options &= ~LIBIE_FWLOG_OPTION_IS_REGISTERED; +} +EXPORT_SYMBOL_GPL(libie_fwlog_reregister); + +MODULE_DESCRIPTION("Intel(R) Ethernet common library"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/net/intel/libie/adminq.h b/include/linux/net/intel/libie/adminq.h index 29420193889a..ab13bd777a28 100644 --- a/include/linux/net/intel/libie/adminq.h +++ b/include/linux/net/intel/libie/adminq.h @@ -265,7 +265,7 @@ enum libie_aqc_fw_logging_mod { LIBIE_AQC_FW_LOG_ID_TSDRV, LIBIE_AQC_FW_LOG_ID_PFREG, LIBIE_AQC_FW_LOG_ID_MDLVER, - LIBIE_AQC_FW_LOG_ID_MAX, + LIBIE_AQC_FW_LOG_ID_MAX }; /* Set FW Logging configuration (indirect 0xFF30) @@ -280,8 +280,8 @@ enum libie_aqc_fw_logging_mod { #define LIBIE_AQC_FW_LOG_AQ_REGISTER BIT(0) #define LIBIE_AQC_FW_LOG_AQ_QUERY BIT(2) -#define LIBIE_AQC_FW_LOG_MIN_RESOLUTION (1) -#define LIBIE_AQC_FW_LOG_MAX_RESOLUTION (128) +#define LIBIE_AQC_FW_LOG_MIN_RESOLUTION 1 +#define LIBIE_AQC_FW_LOG_MAX_RESOLUTION 128 struct libie_aqc_fw_log { u8 cmd_flags; diff --git a/include/linux/net/intel/libie/fwlog.h b/include/linux/net/intel/libie/fwlog.h new file mode 100644 index 000000000000..36b13fabca9e --- /dev/null +++ b/include/linux/net/intel/libie/fwlog.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2022, Intel Corporation. */ + +#ifndef _LIBIE_FWLOG_H_ +#define _LIBIE_FWLOG_H_ + +#include + +/* Only a single log level should be set and all log levels under the set value + * are enabled, e.g. 
if log level is set to LIBIE_FWLOG_LEVEL_VERBOSE, then all + * other log levels are included (except LIBIE_FWLOG_LEVEL_NONE) + */ +enum libie_fwlog_level { + LIBIE_FWLOG_LEVEL_NONE = 0, + LIBIE_FWLOG_LEVEL_ERROR = 1, + LIBIE_FWLOG_LEVEL_WARNING = 2, + LIBIE_FWLOG_LEVEL_NORMAL = 3, + LIBIE_FWLOG_LEVEL_VERBOSE = 4, + LIBIE_FWLOG_LEVEL_INVALID, /* all values >= this entry are invalid */ +}; + +struct libie_fwlog_module_entry { + /* module ID for the corresponding firmware logging event */ + u16 module_id; + /* verbosity level for the module_id */ + u8 log_level; +}; + +struct libie_fwlog_cfg { + /* list of modules for configuring log level */ + struct libie_fwlog_module_entry module_entries[LIBIE_AQC_FW_LOG_ID_MAX]; + /* options used to configure firmware logging */ + u16 options; +#define LIBIE_FWLOG_OPTION_ARQ_ENA BIT(0) +#define LIBIE_FWLOG_OPTION_UART_ENA BIT(1) + /* set before calling libie_fwlog_init() so the PF registers for + * firmware logging on initialization + */ +#define LIBIE_FWLOG_OPTION_REGISTER_ON_INIT BIT(2) + /* set in the libie_aq_fwlog_get() response if the PF is registered for + * FW logging events over ARQ + */ +#define LIBIE_FWLOG_OPTION_IS_REGISTERED BIT(3) + + /* minimum number of log events sent per Admin Receive Queue event */ + u16 log_resolution; +}; + +struct libie_fwlog_data { + u16 data_size; + u8 *data; +}; + +struct libie_fwlog_ring { + struct libie_fwlog_data *rings; + u16 index; + u16 size; + u16 head; + u16 tail; +}; + +#define LIBIE_FWLOG_RING_SIZE_INDEX_DFLT 3 +#define LIBIE_FWLOG_RING_SIZE_DFLT 256 +#define LIBIE_FWLOG_RING_SIZE_MAX 512 + +struct libie_fwlog { + struct libie_fwlog_cfg cfg; + bool supported; /* does hardware support FW logging? */ + struct libie_fwlog_ring ring; + struct dentry *debugfs; + /* keep track of all the dentries for FW log modules */ + struct dentry **debugfs_modules; + struct_group_tagged(libie_fwlog_api, api, + struct pci_dev *pdev; + int (*send_cmd)(void *, struct libie_aq_desc *, void *, u16); + void *priv; + struct dentry *debugfs_root; + ); +}; + +int libie_fwlog_init(struct libie_fwlog *fwlog, struct libie_fwlog_api *api); +void libie_fwlog_deinit(struct libie_fwlog *fwlog); +void libie_fwlog_reregister(struct libie_fwlog *fwlog); +void libie_get_fwlog_data(struct libie_fwlog *fwlog, u8 *buf, u16 len); +#endif /* _LIBIE_FWLOG_H_ */ -- cgit v1.2.3 From 70f23546d246563da648baedbb0432ba1d6bb357 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 11 Sep 2025 14:58:01 +0000 Subject: bpf: core: introduce main_prog_aux for stream access BPF streams are only valid for main programs. To make it easier to access streams from subprogs, introduce main_prog_aux in struct bpf_prog_aux: prog->aux->main_prog_aux = prog->aux for main programs, and prog->aux->main_prog_aux = main_prog->aux for subprograms. Make bpf_prog_find_from_stack() use the added main_prog_aux to return the main prog when a subprog is found on the stack. 
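A minimal sketch of the invariant this commit establishes (the helper below is hypothetical and not part of the patch; the real consumer is find_from_stack_cb() in the diff that follows): resolving any program, main or sub, to its main program becomes a single pointer chase.

/* Hypothetical helper; relies only on the invariant above:
 * main prog: prog->aux->main_prog_aux == prog->aux
 * subprog:   prog->aux->main_prog_aux == main_prog->aux
 */
static struct bpf_prog *bpf_main_prog(struct bpf_prog *prog)
{
	return prog->aux->main_prog_aux->prog;
}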
Signed-off-by: Puranjay Mohan
Acked-by: Eduard Zingerman
Link: https://lore.kernel.org/r/20250911145808.58042-3-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov
---
 include/linux/bpf.h   | 1 +
 kernel/bpf/core.c     | 6 +++---
 kernel/bpf/verifier.c | 1 +
 3 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8f6e87f0f3a8..d133171c4d2a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1633,6 +1633,7 @@ struct bpf_prog_aux {
 	/* function name for valid attach_btf_id */
 	const char *attach_func_name;
 	struct bpf_prog **func;
+	struct bpf_prog_aux *main_prog_aux;
 	void *jit_data; /* JIT specific data. arch dependent */
 	struct bpf_jit_poke_descriptor *poke_tab;
 	struct bpf_kfunc_desc_tab *kfunc_tab;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index e68019459106..1cda2589d4b3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -120,6 +120,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
 	fp->pages = size / PAGE_SIZE;
 	fp->aux = aux;
+	fp->aux->main_prog_aux = aux;
 	fp->aux->prog = fp;
 	fp->jit_requested = ebpf_jit_enabled();
 	fp->blinding_requested = bpf_jit_blinding_enabled(fp);
@@ -3297,9 +3298,8 @@ static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp)
 	rcu_read_unlock();
 	if (!prog)
 		return true;
-	if (bpf_is_subprog(prog))
-		return true;
-	ctxp->prog = prog;
+	/* Make sure we return the main prog if we found a subprog */
+	ctxp->prog = prog->aux->main_prog_aux->prog;
 	return false;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b8b510a25b03..17fe623400a5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21601,6 +21601,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
 		func[i]->aux->poke_tab = prog->aux->poke_tab;
 		func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
+		func[i]->aux->main_prog_aux = prog->aux;
 		for (j = 0; j < prog->aux->size_poke_tab; j++) {
 			struct bpf_jit_poke_descriptor *poke;
--
cgit v1.2.3


From 5c5240d020615f13331f4e2c559186125eddc7d3 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan
Date: Thu, 11 Sep 2025 14:58:02 +0000
Subject: bpf: Report arena faults to BPF stderr

Begin reporting arena page faults and the faulting address to the BPF
program's stderr. This patch adds support in the arm64 and x86-64 JITs;
support for other archs can be added later.

The fault handlers receive the 32-bit address within the arena region,
so the upper 32 bits of user_vm_start are added to it before printing
the address.
This is what the user would expect to see, as this is what bpf_printk()
prints if you pass it an address returned by bpf_arena_alloc_pages().

Signed-off-by: Puranjay Mohan
Acked-by: Yonghong Song
Link: https://lore.kernel.org/r/20250911145808.58042-4-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov
---
 arch/arm64/net/bpf_jit_comp.c | 60 ++++++++++++++++++++++++++++++
 arch/x86/net/bpf_jit_comp.c   | 85 ++++++++++++++++++++++++++++++++++++++++---
 include/linux/bpf.h           |  6 +++
 kernel/bpf/arena.c            | 30 +++++++++++++++
 4 files changed, 176 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index e6d1fdc1e6f5..008273a53e04 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1066,6 +1066,30 @@ static void build_epilogue(struct jit_ctx *ctx, bool was_classic)
 	emit(A64_RET(A64_LR), ctx);
 }

+/*
+ * Metadata encoding for exception handling in JITed code.
+ *
+ * Format of `fixup` field in `struct exception_table_entry`:
+ *
+ * Bit layout of `fixup` (32-bit):
+ *
+ * +-----------+--------+-----------+-----------+----------+
+ * |   31-27   | 26-22  |    21     |   20-16   |   15-0   |
+ * |           |        |           |           |          |
+ * | FIXUP_REG | Unused | ARENA_ACC | ARENA_REG |  OFFSET  |
+ * +-----------+--------+-----------+-----------+----------+
+ *
+ * - OFFSET (16 bits): Offset used to compute address for Load/Store instruction.
+ * - ARENA_REG (5 bits): Register that is used to calculate the address for load/store when
+ *   accessing the arena region.
+ * - ARENA_ACCESS (1 bit): This bit is set when the faulting instruction accessed the arena region.
+ * - FIXUP_REG (5 bits): Destination register for the load instruction (cleared on fault) or set to
+ *   DONT_CLEAR if it is a store instruction.
+ */ + +#define BPF_FIXUP_OFFSET_MASK GENMASK(15, 0) +#define BPF_FIXUP_ARENA_REG_MASK GENMASK(20, 16) +#define BPF_ARENA_ACCESS BIT(21) #define BPF_FIXUP_REG_MASK GENMASK(31, 27) #define DONT_CLEAR 5 /* Unused ARM64 register from BPF's POV */ @@ -1073,11 +1097,22 @@ bool ex_handler_bpf(const struct exception_table_entry *ex, struct pt_regs *regs) { int dst_reg = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup); + s16 off = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); + int arena_reg = FIELD_GET(BPF_FIXUP_ARENA_REG_MASK, ex->fixup); + bool is_arena = !!(ex->fixup & BPF_ARENA_ACCESS); + bool is_write = (dst_reg == DONT_CLEAR); + unsigned long addr; + + if (is_arena) { + addr = regs->regs[arena_reg] + off; + bpf_prog_report_arena_violation(is_write, addr, regs->pc); + } if (dst_reg != DONT_CLEAR) regs->regs[dst_reg] = 0; /* Skip the faulting instruction */ regs->pc += AARCH64_INSN_SIZE; + return true; } @@ -1087,6 +1122,9 @@ static int add_exception_handler(const struct bpf_insn *insn, int dst_reg) { off_t ins_offset; + s16 off = insn->off; + bool is_arena; + int arena_reg; unsigned long pc; struct exception_table_entry *ex; @@ -1100,6 +1138,9 @@ static int add_exception_handler(const struct bpf_insn *insn, BPF_MODE(insn->code) != BPF_PROBE_ATOMIC) return 0; + is_arena = (BPF_MODE(insn->code) == BPF_PROBE_MEM32) || + (BPF_MODE(insn->code) == BPF_PROBE_ATOMIC); + if (!ctx->prog->aux->extable || WARN_ON_ONCE(ctx->exentry_idx >= ctx->prog->aux->num_exentries)) return -EINVAL; @@ -1131,6 +1172,25 @@ static int add_exception_handler(const struct bpf_insn *insn, ex->fixup = FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); + if (is_arena) { + ex->fixup |= BPF_ARENA_ACCESS; + /* + * insn->src_reg/dst_reg holds the address in the arena region with upper 32-bits + * being zero because of a preceding addr_space_cast(r, 0x0, 0x1) instruction. + * This address is adjusted with the addition of arena_vm_start (see the + * implementation of BPF_PROBE_MEM32 and BPF_PROBE_ATOMIC) before being used for the + * memory access. Pass the reg holding the unmodified 32-bit address to + * ex_handler_bpf. + */ + if (BPF_CLASS(insn->code) == BPF_LDX) + arena_reg = bpf2a64[insn->src_reg]; + else + arena_reg = bpf2a64[insn->dst_reg]; + + ex->fixup |= FIELD_PREP(BPF_FIXUP_OFFSET_MASK, off) | + FIELD_PREP(BPF_FIXUP_ARENA_REG_MASK, arena_reg); + } + ex->type = EX_TYPE_BPF; ctx->exentry_idx++; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7e3fca164620..8d34a9400a5e 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -1388,16 +1389,67 @@ static int emit_atomic_ld_st_index(u8 **pprog, u32 atomic_op, u32 size, return 0; } +/* + * Metadata encoding for exception handling in JITed code. + * + * Format of `fixup` and `data` fields in `struct exception_table_entry`: + * + * Bit layout of `fixup` (32-bit): + * + * +-----------+--------+-----------+---------+----------+ + * | 31 | 30-24 | 23-16 | 15-8 | 7-0 | + * | | | | | | + * | ARENA_ACC | Unused | ARENA_REG | DST_REG | INSN_LEN | + * +-----------+--------+-----------+---------+----------+ + * + * - INSN_LEN (8 bits): Length of faulting insn (max x86 insn = 15 bytes (fits in 8 bits)). + * - DST_REG (8 bits): Offset of dst_reg from reg2pt_regs[] (max offset = 112 (fits in 8 bits)). + * This is set to DONT_CLEAR if the insn is a store. 
+ * - ARENA_REG (8 bits): Offset of the register that is used to calculate the + * address for load/store when accessing the arena region. + * - ARENA_ACCESS (1 bit): This bit is set when the faulting instruction accessed the arena region. + * + * Bit layout of `data` (32-bit): + * + * +--------------+--------+--------------+ + * | 31-16 | 15-8 | 7-0 | + * | | | | + * | ARENA_OFFSET | Unused | EX_TYPE_BPF | + * +--------------+--------+--------------+ + * + * - ARENA_OFFSET (16 bits): Offset used to calculate the address for load/store when + * accessing the arena region. + */ + #define DONT_CLEAR 1 +#define FIXUP_INSN_LEN_MASK GENMASK(7, 0) +#define FIXUP_REG_MASK GENMASK(15, 8) +#define FIXUP_ARENA_REG_MASK GENMASK(23, 16) +#define FIXUP_ARENA_ACCESS BIT(31) +#define DATA_ARENA_OFFSET_MASK GENMASK(31, 16) bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) { - u32 reg = x->fixup >> 8; + u32 reg = FIELD_GET(FIXUP_REG_MASK, x->fixup); + u32 insn_len = FIELD_GET(FIXUP_INSN_LEN_MASK, x->fixup); + bool is_arena = !!(x->fixup & FIXUP_ARENA_ACCESS); + bool is_write = (reg == DONT_CLEAR); + unsigned long addr; + s16 off; + u32 arena_reg; + + if (is_arena) { + arena_reg = FIELD_GET(FIXUP_ARENA_REG_MASK, x->fixup); + off = FIELD_GET(DATA_ARENA_OFFSET_MASK, x->data); + addr = *(unsigned long *)((void *)regs + arena_reg) + off; + bpf_prog_report_arena_violation(is_write, addr, regs->ip); + } /* jump over faulting load and clear dest register */ if (reg != DONT_CLEAR) *(unsigned long *)((void *)regs + reg) = 0; - regs->ip += x->fixup & 0xff; + regs->ip += insn_len; + return true; } @@ -2070,6 +2122,7 @@ populate_extable: { struct exception_table_entry *ex; u8 *_insn = image + proglen + (start_of_ldx - temp); + u32 arena_reg, fixup_reg; s64 delta; if (!bpf_prog->aux->extable) @@ -2089,8 +2142,29 @@ populate_extable: ex->data = EX_TYPE_BPF; - ex->fixup = (prog - start_of_ldx) | - ((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 8); + /* + * src_reg/dst_reg holds the address in the arena region with upper + * 32-bits being zero because of a preceding addr_space_cast(r, + * 0x0, 0x1) instruction. This address is adjusted with the addition + * of arena_vm_start (see the implementation of BPF_PROBE_MEM32 and + * BPF_PROBE_ATOMIC) before being used for the memory access. Pass + * the reg holding the unmodified 32-bit address to + * ex_handler_bpf(). + */ + if (BPF_CLASS(insn->code) == BPF_LDX) { + arena_reg = reg2pt_regs[src_reg]; + fixup_reg = reg2pt_regs[dst_reg]; + } else { + arena_reg = reg2pt_regs[dst_reg]; + fixup_reg = DONT_CLEAR; + } + + ex->fixup = FIELD_PREP(FIXUP_INSN_LEN_MASK, prog - start_of_ldx) | + FIELD_PREP(FIXUP_ARENA_REG_MASK, arena_reg) | + FIELD_PREP(FIXUP_REG_MASK, fixup_reg); + ex->fixup |= FIXUP_ARENA_ACCESS; + + ex->data |= FIELD_PREP(DATA_ARENA_OFFSET_MASK, insn->off); } break; @@ -2208,7 +2282,8 @@ populate_extable: * End result: x86 insn "mov rbx, qword ptr [rax+0x14]" * of 4 bytes will be ignored and rbx will be zero inited. 
 */
-		ex->fixup = (prog - start_of_ldx) | (reg2pt_regs[dst_reg] << 8);
+		ex->fixup = FIELD_PREP(FIXUP_INSN_LEN_MASK, prog - start_of_ldx) |
+			    FIELD_PREP(FIXUP_REG_MASK, reg2pt_regs[dst_reg]);
 	}
 	break;

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d133171c4d2a..41f776071ff5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2881,6 +2881,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
 		     enum bpf_dynptr_type type, u32 offset, u32 size);
 void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
 void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr);
+void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip);

 #else /* !CONFIG_BPF_SYSCALL */

 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
@@ -3168,6 +3169,11 @@ static inline void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
 static inline void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
 {
 }
+
+static inline void bpf_prog_report_arena_violation(bool write, unsigned long addr,
+						   unsigned long fault_ip)
+{
+}
 #endif /* CONFIG_BPF_SYSCALL */

 static __always_inline int
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 5b37753799d2..1074ac4459f2 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -633,3 +633,33 @@ static int __init kfunc_init(void)
 	return register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
 }
 late_initcall(kfunc_init);
+
+void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+{
+	struct bpf_stream_stage ss;
+	struct bpf_prog *prog;
+	u64 user_vm_start;
+
+	/*
+	 * The RCU read lock is held to safely traverse the latch tree, but we
+	 * don't need its protection when accessing the prog, since it will not
+	 * disappear while we are handling the fault.
+	 */
+	rcu_read_lock();
+	prog = bpf_prog_ksym_find(fault_ip);
+	rcu_read_unlock();
+	if (!prog)
+		return;
+
+	/* Use main prog for stream access */
+	prog = prog->aux->main_prog_aux->prog;
+
+	user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena);
+	addr += clear_lo32(user_vm_start);
+
+	bpf_stream_stage(ss, prog, BPF_STDERR, ({
+		bpf_stream_printk(ss, "ERROR: Arena %s access at unmapped address 0x%lx\n",
+				  write ? "WRITE" : "READ", addr);
+		bpf_stream_dump_stack(ss);
+	}));
+}
--
cgit v1.2.3


From eadaa8b255f36ee39ca97d0815c25eeeb1f5d674 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Tue, 9 Sep 2025 16:27:29 +0300
Subject: dma-mapping: introduce new DMA attribute to indicate MMIO memory

This patch introduces the DMA_ATTR_MMIO attribute to mark DMA buffers
that reside in memory-mapped I/O (MMIO) regions, such as device BARs
exposed through the host bridge, which are accessible for peer-to-peer
(P2P) DMA.

This attribute is especially useful for exporting device memory to
other devices for DMA without CPU involvement, and avoids unnecessary
or potentially detrimental CPU cache maintenance calls.

DMA_ATTR_MMIO is supposed to provide dma_map_resource() functionality
without the need for callers to call a special function and perform
branching when processing generic containers like bio_vec.
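A hypothetical caller sketch of the intended pattern, once a phys-based
mapping helper that honors this attribute is available (dma_map_phys()
is introduced later in this series); the segment_is_p2p_mmio flag is an
assumption standing in for the real p2pdma query:

	unsigned long attrs = 0;
	dma_addr_t dma;

	if (segment_is_p2p_mmio)	/* e.g. determined via the p2pdma APIs */
		attrs |= DMA_ATTR_MMIO;

	/* one call site for both system memory and MMIO segments,
	 * no branching into a separate dma_map_resource() path
	 */
	dma = dma_map_phys(dev, phys, len, DMA_TO_DEVICE, attrs);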
Reviewed-by: Jason Gunthorpe
Signed-off-by: Leon Romanovsky
Signed-off-by: Marek Szyprowski
Link: https://lore.kernel.org/r/6f058ec395c5348014860dbc2eed348c17975843.1757423202.git.leonro@nvidia.com
---
 Documentation/core-api/dma-attributes.rst | 18 ++++++++++++++++++
 include/linux/dma-mapping.h               | 20 ++++++++++++++++++++
 include/trace/events/dma.h                |  3 ++-
 rust/kernel/dma.rs                        |  3 +++
 4 files changed, 43 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst
index 1887d92e8e92..0bdc2be65e57 100644
--- a/Documentation/core-api/dma-attributes.rst
+++ b/Documentation/core-api/dma-attributes.rst
@@ -130,3 +130,21 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged
 subsystem that the buffer is fully accessible at the elevated privilege
 level (and ideally inaccessible or at least read-only at the
 lesser-privileged levels).
+
+DMA_ATTR_MMIO
+-------------
+
+This attribute indicates the physical address is not normal system
+memory. It may not be used with kmap*()/phys_to_virt()/phys_to_page()
+functions, it may not be cacheable, and access using CPU load/store
+instructions may not be allowed.
+
+Usually this will be used to describe MMIO addresses, or other
+non-cacheable register addresses. When DMA mapping this sort of address
+we call the operation Peer to Peer as one device is DMA'ing to another
+device. For PCI devices the p2pdma APIs must be used to determine if
+DMA_ATTR_MMIO is appropriate.
+
+For architectures that require cache flushing for DMA coherence
+DMA_ATTR_MMIO will not perform any cache flushing. The address
+provided must never be mapped cacheable into the CPU.
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 55c03e5fe8cb..4254fd9bdf5d 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -58,6 +58,26 @@
  */
 #define DMA_ATTR_PRIVILEGED		(1UL << 9)

+/*
+ * DMA_ATTR_MMIO - Indicates memory-mapped I/O (MMIO) region for DMA mapping
+ *
+ * This attribute indicates the physical address is not normal system
+ * memory. It may not be used with kmap*()/phys_to_virt()/phys_to_page()
+ * functions, it may not be cacheable, and access using CPU load/store
+ * instructions may not be allowed.
+ *
+ * Usually this will be used to describe MMIO addresses, or other
+ * non-cacheable register addresses. When DMA mapping this sort of address
+ * we call the operation Peer to Peer as one device is DMA'ing to another
+ * device. For PCI devices the p2pdma APIs must be used to determine if
+ * DMA_ATTR_MMIO is appropriate.
+ *
+ * For architectures that require cache flushing for DMA coherence
+ * DMA_ATTR_MMIO will not perform any cache flushing. The address
+ * provided must never be mapped cacheable into the CPU.
+ */
+#define DMA_ATTR_MMIO		(1UL << 10)
+
 /*
  * A dma_addr_t can hold any valid DMA or bus address for the platform. It can
  * be given to a device to use as a DMA source or target.
It is specific to a diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index d8ddc27b6a7c..ee90d6f1dcf3 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -31,7 +31,8 @@ TRACE_DEFINE_ENUM(DMA_NONE); { DMA_ATTR_FORCE_CONTIGUOUS, "FORCE_CONTIGUOUS" }, \ { DMA_ATTR_ALLOC_SINGLE_PAGES, "ALLOC_SINGLE_PAGES" }, \ { DMA_ATTR_NO_WARN, "NO_WARN" }, \ - { DMA_ATTR_PRIVILEGED, "PRIVILEGED" }) + { DMA_ATTR_PRIVILEGED, "PRIVILEGED" }, \ + { DMA_ATTR_MMIO, "MMIO" }) DECLARE_EVENT_CLASS(dma_map, TP_PROTO(struct device *dev, phys_addr_t phys_addr, dma_addr_t dma_addr, diff --git a/rust/kernel/dma.rs b/rust/kernel/dma.rs index 2bc8ab51ec28..61d9eed7a786 100644 --- a/rust/kernel/dma.rs +++ b/rust/kernel/dma.rs @@ -242,6 +242,9 @@ pub mod attrs { /// Indicates that the buffer is fully accessible at an elevated privilege level (and /// ideally inaccessible or at least read-only at lesser-privileged levels). pub const DMA_ATTR_PRIVILEGED: Attrs = Attrs(bindings::DMA_ATTR_PRIVILEGED); + + /// Indicates that the buffer is MMIO memory. + pub const DMA_ATTR_MMIO: Attrs = Attrs(bindings::DMA_ATTR_MMIO); } /// An abstraction of the `dma_alloc_coherent` API. -- cgit v1.2.3 From e9e81d86fee63c6d5757841ab557019ddf73786f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 9 Sep 2025 16:27:31 +0300 Subject: dma-debug: refactor to use physical addresses for page mapping Convert the DMA debug infrastructure from page-based to physical address-based mapping as a preparation to rely on physical address for DMA mapping routines. The refactoring renames debug_dma_map_page() to debug_dma_map_phys() and changes its signature to accept a phys_addr_t parameter instead of struct page and offset. Similarly, debug_dma_unmap_page() becomes debug_dma_unmap_phys(). A new dma_debug_phy type is introduced to distinguish physical address mappings from other debug entry types. All callers throughout the codebase are updated to pass physical addresses directly, eliminating the need for page-to-physical conversion in the debug layer. This refactoring eliminates the need to convert between page pointers and physical addresses in the debug layer, making the code more efficient and consistent with the DMA mapping API's physical address focus. Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky [mszyprow: added a fixup] Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/56d1a6769b68dfcbf8b26a75a7329aeb8e3c3b6a.1757423202.git.leonro@nvidia.com Link: https://lore.kernel.org/all/20250910052618.GH341237@unreal/ --- Documentation/core-api/dma-api.rst | 4 ++-- include/linux/page-flags.h | 1 + kernel/dma/debug.c | 39 +++++++++++++++++++------------------- kernel/dma/debug.h | 16 +++++++--------- kernel/dma/mapping.c | 10 +++++----- 5 files changed, 35 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst index 3087bea715ed..ca75b3541679 100644 --- a/Documentation/core-api/dma-api.rst +++ b/Documentation/core-api/dma-api.rst @@ -761,7 +761,7 @@ example warning message may look like this:: [] find_busiest_group+0x207/0x8a0 [] _spin_lock_irqsave+0x1f/0x50 [] check_unmap+0x203/0x490 - [] debug_dma_unmap_page+0x49/0x50 + [] debug_dma_unmap_phys+0x49/0x50 [] nv_tx_done_optimized+0xc6/0x2c0 [] nv_nic_irq_optimized+0x73/0x2b0 [] handle_IRQ_event+0x34/0x70 @@ -855,7 +855,7 @@ that a driver may be leaking mappings. 
dma-debug interface debug_dma_mapping_error() to debug drivers that fail to check DMA mapping errors on addresses returned by dma_map_single() and dma_map_page() interfaces. This interface clears a flag set by -debug_dma_map_page() to indicate that dma_mapping_error() has been called by +debug_dma_map_phys() to indicate that dma_mapping_error() has been called by the driver. When driver does unmap, debug_dma_unmap() checks the flag and if this flag is still set, prints warning message that includes call trace that leads up to the unmap. This interface can be called from dma_mapping_error() diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8d3fa3a91ce4..2a1f34617802 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -618,6 +618,7 @@ FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE) #else PAGEFLAG_FALSE(HighMem, highmem) #endif +#define PhysHighMem(__p) (PageHighMem(phys_to_page(__p))) /* Does kmap_local_folio() only allow access to one page of the folio? */ #ifdef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index b82399437db0..b275db9ca6a0 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -40,6 +40,7 @@ enum { dma_debug_coherent, dma_debug_resource, dma_debug_noncoherent, + dma_debug_phy, }; enum map_err_types { @@ -143,6 +144,7 @@ static const char *type2name[] = { [dma_debug_coherent] = "coherent", [dma_debug_resource] = "resource", [dma_debug_noncoherent] = "noncoherent", + [dma_debug_phy] = "phy", }; static const char *dir2name[] = { @@ -1054,17 +1056,16 @@ static void check_unmap(struct dma_debug_entry *ref) dma_entry_free(entry); } -static void check_for_stack(struct device *dev, - struct page *page, size_t offset) +static void check_for_stack(struct device *dev, phys_addr_t phys) { void *addr; struct vm_struct *stack_vm_area = task_stack_vm_area(current); if (!stack_vm_area) { /* Stack is direct-mapped. 
*/ - if (PageHighMem(page)) + if (PhysHighMem(phys)) return; - addr = page_address(page) + offset; + addr = phys_to_virt(phys); if (object_is_on_stack(addr)) err_printk(dev, NULL, "device driver maps memory from stack [addr=%p]\n", addr); } else { @@ -1072,10 +1073,12 @@ static void check_for_stack(struct device *dev, int i; for (i = 0; i < stack_vm_area->nr_pages; i++) { - if (page != stack_vm_area->pages[i]) + if (__phys_to_pfn(phys) != + page_to_pfn(stack_vm_area->pages[i])) continue; - addr = (u8 *)current->stack + i * PAGE_SIZE + offset; + addr = (u8 *)current->stack + i * PAGE_SIZE + + (phys % PAGE_SIZE); err_printk(dev, NULL, "device driver maps memory from stack [probable addr=%p]\n", addr); break; } @@ -1204,9 +1207,8 @@ void debug_dma_map_single(struct device *dev, const void *addr, } EXPORT_SYMBOL(debug_dma_map_single); -void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, - size_t size, int direction, dma_addr_t dma_addr, - unsigned long attrs) +void debug_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + int direction, dma_addr_t dma_addr, unsigned long attrs) { struct dma_debug_entry *entry; @@ -1221,19 +1223,18 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, return; entry->dev = dev; - entry->type = dma_debug_single; - entry->paddr = page_to_phys(page) + offset; + entry->type = dma_debug_phy; + entry->paddr = phys; entry->dev_addr = dma_addr; entry->size = size; entry->direction = direction; entry->map_err_type = MAP_ERR_NOT_CHECKED; - check_for_stack(dev, page, offset); + if (!(attrs & DMA_ATTR_MMIO)) { + check_for_stack(dev, phys); - if (!PageHighMem(page)) { - void *addr = page_address(page) + offset; - - check_for_illegal_area(dev, addr, size); + if (!PhysHighMem(phys)) + check_for_illegal_area(dev, phys_to_virt(phys), size); } add_dma_entry(entry, attrs); @@ -1277,11 +1278,11 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) } EXPORT_SYMBOL(debug_dma_mapping_error); -void debug_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, +void debug_dma_unmap_phys(struct device *dev, dma_addr_t dma_addr, size_t size, int direction) { struct dma_debug_entry ref = { - .type = dma_debug_single, + .type = dma_debug_phy, .dev = dev, .dev_addr = dma_addr, .size = size, @@ -1305,7 +1306,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, return; for_each_sg(sg, s, nents, i) { - check_for_stack(dev, sg_page(s), s->offset); + check_for_stack(dev, sg_phys(s)); if (!PageHighMem(sg_page(s))) check_for_illegal_area(dev, sg_virt(s), s->length); } diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index 48757ca13f31..bedae973e725 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -9,12 +9,11 @@ #define _KERNEL_DMA_DEBUG_H #ifdef CONFIG_DMA_API_DEBUG -extern void debug_dma_map_page(struct device *dev, struct page *page, - size_t offset, size_t size, - int direction, dma_addr_t dma_addr, +extern void debug_dma_map_phys(struct device *dev, phys_addr_t phys, + size_t size, int direction, dma_addr_t dma_addr, unsigned long attrs); -extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, +extern void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, int direction); extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, @@ -62,14 +61,13 @@ extern void debug_dma_free_pages(struct device *dev, struct page *page, size_t size, int direction, dma_addr_t dma_addr); #else /* CONFIG_DMA_API_DEBUG */ -static inline void 
debug_dma_map_page(struct device *dev, struct page *page, - size_t offset, size_t size, - int direction, dma_addr_t dma_addr, - unsigned long attrs) +static inline void debug_dma_map_phys(struct device *dev, phys_addr_t phys, + size_t size, int direction, + dma_addr_t dma_addr, unsigned long attrs) { } -static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, +static inline void debug_dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, int direction) { } diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 56de28a3b179..0b7e16c69bf1 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -157,6 +157,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); + phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); @@ -165,16 +166,15 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, return DMA_MAPPING_ERROR; if (dma_map_direct(dev, ops) || - arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size)) + arch_dma_map_page_direct(dev, phys + size)) addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else if (use_dma_iommu(dev)) addr = iommu_dma_map_page(dev, page, offset, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); kmsan_handle_dma(page, offset, size, dir); - trace_dma_map_page(dev, page_to_phys(page) + offset, addr, size, dir, - attrs); - debug_dma_map_page(dev, page, offset, size, dir, addr, attrs); + trace_dma_map_page(dev, phys, addr, size, dir, attrs); + debug_dma_map_phys(dev, phys, size, dir, addr, attrs); return addr; } @@ -194,7 +194,7 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, else ops->unmap_page(dev, addr, size, dir, attrs); trace_dma_unmap_page(dev, addr, size, dir, attrs); - debug_dma_unmap_page(dev, addr, size, dir); + debug_dma_unmap_phys(dev, addr, size, dir); } EXPORT_SYMBOL(dma_unmap_page_attrs); -- cgit v1.2.3 From 513559f73700966ded094b090c3ecc6dff877ef9 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 9 Sep 2025 16:27:33 +0300 Subject: iommu/dma: rename iommu_dma_*map_page to iommu_dma_*map_phys Rename the IOMMU DMA mapping functions to better reflect their actual calling convention. The functions iommu_dma_map_page() and iommu_dma_unmap_page() are renamed to iommu_dma_map_phys() and iommu_dma_unmap_phys() respectively, as they already operate on physical addresses rather than page structures. The calling convention changes from accepting (struct page *page, unsigned long offset) to (phys_addr_t phys), which eliminates the need for page-to-physical address conversion within the functions. This renaming prepares for the broader DMA API conversion from page-based to physical address-based mapping throughout the kernel. All callers are updated to pass physical addresses directly, including dma_map_page_attrs(), scatterlist mapping functions, and DMA page allocation helpers. The change simplifies the code by removing the page_to_phys() + offset calculation that was previously done inside the IOMMU functions. 
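At a call site the conversion is mechanical; a before/after sketch for
one scatterlist entry (illustrative only, mirroring the hunks below):

	/* before: page + offset convention */
	sg_dma_address(s) = iommu_dma_map_page(dev, sg_page(s), s->offset,
					       s->length, dir, attrs);

	/* after: the physical address already encodes page and offset */
	sg_dma_address(s) = iommu_dma_map_phys(dev, sg_phys(s),
					       s->length, dir, attrs);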
Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/ed172f95f8f57782beae04f782813366894e98df.1757423202.git.leonro@nvidia.com --- drivers/iommu/dma-iommu.c | 14 ++++++-------- include/linux/iommu-dma.h | 7 +++---- kernel/dma/mapping.c | 4 ++-- kernel/dma/ops_helpers.c | 6 +++--- 4 files changed, 14 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index e1185ba73e23..aea119f32f96 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1195,11 +1195,9 @@ static inline size_t iova_unaligned(struct iova_domain *iovad, phys_addr_t phys, return iova_offset(iovad, phys | size); } -dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) +dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t phys = page_to_phys(page) + offset; bool coherent = dev_is_dma_coherent(dev); int prot = dma_info_to_prot(dir, coherent, attrs); struct iommu_domain *domain = iommu_get_dma_domain(dev); @@ -1227,7 +1225,7 @@ dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, return iova; } -void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, +void iommu_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs) { struct iommu_domain *domain = iommu_get_dma_domain(dev); @@ -1346,7 +1344,7 @@ static void iommu_dma_unmap_sg_swiotlb(struct device *dev, struct scatterlist *s int i; for_each_sg(sg, s, nents, i) - iommu_dma_unmap_page(dev, sg_dma_address(s), + iommu_dma_unmap_phys(dev, sg_dma_address(s), sg_dma_len(s), dir, attrs); } @@ -1359,8 +1357,8 @@ static int iommu_dma_map_sg_swiotlb(struct device *dev, struct scatterlist *sg, sg_dma_mark_swiotlb(sg); for_each_sg(sg, s, nents, i) { - sg_dma_address(s) = iommu_dma_map_page(dev, sg_page(s), - s->offset, s->length, dir, attrs); + sg_dma_address(s) = iommu_dma_map_phys(dev, sg_phys(s), + s->length, dir, attrs); if (sg_dma_address(s) == DMA_MAPPING_ERROR) goto out_unmap; sg_dma_len(s) = s->length; diff --git a/include/linux/iommu-dma.h b/include/linux/iommu-dma.h index 508beaa44c39..485bdffed988 100644 --- a/include/linux/iommu-dma.h +++ b/include/linux/iommu-dma.h @@ -21,10 +21,9 @@ static inline bool use_dma_iommu(struct device *dev) } #endif /* CONFIG_IOMMU_DMA */ -dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs); -void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, +dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs); +void iommu_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs); int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index bd3bb6d59d72..90ad728205b9 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -169,7 +169,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, arch_dma_map_page_direct(dev, phys + size)) addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else 
if (use_dma_iommu(dev))
-		addr = iommu_dma_map_page(dev, page, offset, size, dir, attrs);
+		addr = iommu_dma_map_phys(dev, phys, size, dir, attrs);
 	else
 		addr = ops->map_page(dev, page, offset, size, dir, attrs);
 	kmsan_handle_dma(page, offset, size, dir);
@@ -190,7 +190,7 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
 		arch_dma_unmap_page_direct(dev, addr + size))
 		dma_direct_unmap_page(dev, addr, size, dir, attrs);
 	else if (use_dma_iommu(dev))
-		iommu_dma_unmap_page(dev, addr, size, dir, attrs);
+		iommu_dma_unmap_phys(dev, addr, size, dir, attrs);
 	else
 		ops->unmap_page(dev, addr, size, dir, attrs);
 	trace_dma_unmap_phys(dev, addr, size, dir, attrs);
diff --git a/kernel/dma/ops_helpers.c b/kernel/dma/ops_helpers.c
index 9afd569eadb9..6f9d604d9d40 100644
--- a/kernel/dma/ops_helpers.c
+++ b/kernel/dma/ops_helpers.c
@@ -72,8 +72,8 @@ struct page *dma_common_alloc_pages(struct device *dev, size_t size,
 		return NULL;

 	if (use_dma_iommu(dev))
-		*dma_handle = iommu_dma_map_page(dev, page, 0, size, dir,
-						 DMA_ATTR_SKIP_CPU_SYNC);
+		*dma_handle = iommu_dma_map_phys(dev, page_to_phys(page), size,
+						 dir, DMA_ATTR_SKIP_CPU_SYNC);
 	else
 		*dma_handle = ops->map_page(dev, page, 0, size, dir,
 					    DMA_ATTR_SKIP_CPU_SYNC);
@@ -92,7 +92,7 @@ void dma_common_free_pages(struct device *dev, size_t size, struct page *page,
 	const struct dma_map_ops *ops = get_dma_ops(dev);

 	if (use_dma_iommu(dev))
-		iommu_dma_unmap_page(dev, dma_handle, size, dir,
+		iommu_dma_unmap_phys(dev, dma_handle, size, dir,
 				     DMA_ATTR_SKIP_CPU_SYNC);
 	else if (ops->unmap_page)
 		ops->unmap_page(dev, dma_handle, size, dir,
--
cgit v1.2.3


From e53d29f957b36ba1666331956c6ccb047bb157d2 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Tue, 9 Sep 2025 16:27:35 +0300
Subject: dma-mapping: convert dma_direct_*map_page to be phys_addr_t based

Convert the DMA direct mapping functions to accept physical addresses
directly instead of page+offset parameters. The functions were already
operating on physical addresses internally, so this change eliminates
the redundant page-to-physical conversion at the API boundary.

The functions dma_direct_map_page() and dma_direct_unmap_page() are
renamed to dma_direct_map_phys() and dma_direct_unmap_phys()
respectively, with their calling convention changed from
(struct page *page, unsigned long offset) to (phys_addr_t phys).

Architecture-specific functions arch_dma_map_page_direct() and
arch_dma_unmap_page_direct() are similarly renamed to
arch_dma_map_phys_direct() and arch_dma_unmap_phys_direct().

The is_pci_p2pdma_page() checks are replaced with DMA_ATTR_MMIO checks
to allow integration with dma_direct_map_resource(), and
dma_direct_map_phys() is extended to support the MMIO path as well.
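A condensed sketch of the resulting dispatch in dma_direct_map_phys()
(capability checks and swiotlb bouncing elided; see the full hunk in
the diff below):

	if (attrs & DMA_ATTR_MMIO) {
		/* bus address used as-is: no swiotlb, no cache sync */
		dma_addr = phys;
	} else {
		/* normal system memory: translate and keep caches clean */
		dma_addr = phys_to_dma(dev, phys);
		if (!dev_is_dma_coherent(dev) &&
		    !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
			arch_sync_dma_for_device(phys, size, dir);
	}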
Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/bb15a22f76dc2e26683333ff54e789606cfbfcf0.1757423202.git.leonro@nvidia.com --- arch/powerpc/kernel/dma-iommu.c | 4 +-- include/linux/dma-map-ops.h | 8 +++--- kernel/dma/direct.c | 6 ++--- kernel/dma/direct.h | 57 ++++++++++++++++++++++++++--------------- kernel/dma/mapping.c | 8 +++--- 5 files changed, 49 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 4d64a5db50f3..0359ab72cd3b 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -14,7 +14,7 @@ #define can_map_direct(dev, addr) \ ((dev)->bus_dma_limit >= phys_to_dma((dev), (addr))) -bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr) +bool arch_dma_map_phys_direct(struct device *dev, phys_addr_t addr) { if (likely(!dev->bus_dma_limit)) return false; @@ -24,7 +24,7 @@ bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr) #define is_direct_handle(dev, h) ((h) >= (dev)->archdata.dma_offset) -bool arch_dma_unmap_page_direct(struct device *dev, dma_addr_t dma_handle) +bool arch_dma_unmap_phys_direct(struct device *dev, dma_addr_t dma_handle) { if (likely(!dev->bus_dma_limit)) return false; diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 332b80c42b6f..10882d00cb17 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -395,15 +395,15 @@ void *arch_dma_set_uncached(void *addr, size_t size); void arch_dma_clear_uncached(void *addr, size_t size); #ifdef CONFIG_ARCH_HAS_DMA_MAP_DIRECT -bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr); -bool arch_dma_unmap_page_direct(struct device *dev, dma_addr_t dma_handle); +bool arch_dma_map_phys_direct(struct device *dev, phys_addr_t addr); +bool arch_dma_unmap_phys_direct(struct device *dev, dma_addr_t dma_handle); bool arch_dma_map_sg_direct(struct device *dev, struct scatterlist *sg, int nents); bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg, int nents); #else -#define arch_dma_map_page_direct(d, a) (false) -#define arch_dma_unmap_page_direct(d, a) (false) +#define arch_dma_map_phys_direct(d, a) (false) +#define arch_dma_unmap_phys_direct(d, a) (false) #define arch_dma_map_sg_direct(d, s, n) (false) #define arch_dma_unmap_sg_direct(d, s, n) (false) #endif diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 302e89580972..ba7524f169bc 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -448,7 +448,7 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, if (sg_dma_is_bus_address(sg)) sg_dma_unmark_bus_address(sg); else - dma_direct_unmap_page(dev, sg->dma_address, + dma_direct_unmap_phys(dev, sg->dma_address, sg_dma_len(sg), dir, attrs); } } @@ -471,8 +471,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, */ break; case PCI_P2PDMA_MAP_NONE: - sg->dma_address = dma_direct_map_page(dev, sg_page(sg), - sg->offset, sg->length, dir, attrs); + sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg), + sg->length, dir, attrs); if (sg->dma_address == DMA_MAPPING_ERROR) { ret = -EIO; goto out_unmap; diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index d2c0b7e632fc..da2fadf45bcd 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -80,42 +80,57 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev, arch_dma_mark_clean(paddr, size); } -static 
inline dma_addr_t dma_direct_map_page(struct device *dev, - struct page *page, unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs) +static inline dma_addr_t dma_direct_map_phys(struct device *dev, + phys_addr_t phys, size_t size, enum dma_data_direction dir, + unsigned long attrs) { - phys_addr_t phys = page_to_phys(page) + offset; - dma_addr_t dma_addr = phys_to_dma(dev, phys); + dma_addr_t dma_addr; if (is_swiotlb_force_bounce(dev)) { - if (is_pci_p2pdma_page(page)) - return DMA_MAPPING_ERROR; + if (attrs & DMA_ATTR_MMIO) + goto err_overflow; + return swiotlb_map(dev, phys, size, dir, attrs); } - if (unlikely(!dma_capable(dev, dma_addr, size, true)) || - dma_kmalloc_needs_bounce(dev, size, dir)) { - if (is_pci_p2pdma_page(page)) - return DMA_MAPPING_ERROR; - if (is_swiotlb_active(dev)) - return swiotlb_map(dev, phys, size, dir, attrs); - - dev_WARN_ONCE(dev, 1, - "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - return DMA_MAPPING_ERROR; + if (attrs & DMA_ATTR_MMIO) { + dma_addr = phys; + if (unlikely(!dma_capable(dev, dma_addr, size, false))) + goto err_overflow; + } else { + dma_addr = phys_to_dma(dev, phys); + if (unlikely(!dma_capable(dev, dma_addr, size, true)) || + dma_kmalloc_needs_bounce(dev, size, dir)) { + if (is_swiotlb_active(dev)) + return swiotlb_map(dev, phys, size, dir, attrs); + + goto err_overflow; + } } - if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + if (!dev_is_dma_coherent(dev) && + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) arch_sync_dma_for_device(phys, size, dir); return dma_addr; + +err_overflow: + dev_WARN_ONCE( + dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; } -static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, +static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - phys_addr_t phys = dma_to_phys(dev, addr); + phys_addr_t phys; + + if (attrs & DMA_ATTR_MMIO) + /* nothing to do: uncached and no swiotlb */ + return; + phys = dma_to_phys(dev, addr); if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) dma_direct_sync_single_for_cpu(dev, addr, size, dir); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 90ad728205b9..3ac7d15e095f 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -166,8 +166,8 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, return DMA_MAPPING_ERROR; if (dma_map_direct(dev, ops) || - arch_dma_map_page_direct(dev, phys + size)) - addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); + arch_dma_map_phys_direct(dev, phys + size)) + addr = dma_direct_map_phys(dev, phys, size, dir, attrs); else if (use_dma_iommu(dev)) addr = iommu_dma_map_phys(dev, phys, size, dir, attrs); else @@ -187,8 +187,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, BUG_ON(!valid_dma_direction(dir)); if (dma_map_direct(dev, ops) || - arch_dma_unmap_page_direct(dev, addr + size)) - dma_direct_unmap_page(dev, addr, size, dir, attrs); + arch_dma_unmap_phys_direct(dev, addr + size)) + dma_direct_unmap_phys(dev, addr, size, dir, attrs); else if (use_dma_iommu(dev)) iommu_dma_unmap_phys(dev, addr, size, dir, attrs); else -- cgit v1.2.3 From 6eb1e769b2c13a33cb2ca694454a7561d3d72c0a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 9 Sep 2025 
16:27:36 +0300 Subject: kmsan: convert kmsan_handle_dma to use physical addresses Convert the KMSAN DMA handling function from page-based to physical address-based interface. The refactoring renames kmsan_handle_dma() parameters from accepting (struct page *page, size_t offset, size_t size) to (phys_addr_t phys, size_t size). The existing semantics where callers are expected to provide only kmap memory is continued here. Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/3557cbaf66e935bc794f37d2b891ef75cbf2c80c.1757423202.git.leonro@nvidia.com --- drivers/virtio/virtio_ring.c | 4 ++-- include/linux/kmsan.h | 9 ++++----- kernel/dma/mapping.c | 3 ++- mm/kmsan/hooks.c | 10 ++++++---- tools/virtio/linux/kmsan.h | 2 +- 5 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index f5062061c408..c147145a6593 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -378,7 +378,7 @@ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist * is initialized by the hardware. Explicitly check/unpoison it * depending on the direction. */ - kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, direction); + kmsan_handle_dma(sg_phys(sg), sg->length, direction); *addr = (dma_addr_t)sg_phys(sg); return 0; } @@ -3157,7 +3157,7 @@ dma_addr_t virtqueue_dma_map_single_attrs(struct virtqueue *_vq, void *ptr, struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) { - kmsan_handle_dma(virt_to_page(ptr), offset_in_page(ptr), size, dir); + kmsan_handle_dma(virt_to_phys(ptr), size, dir); return (dma_addr_t)virt_to_phys(ptr); } diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 2b1432cc16d5..f2fd221107bb 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -182,8 +182,7 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); /** * kmsan_handle_dma() - Handle a DMA data transfer. - * @page: first page of the buffer. - * @offset: offset of the buffer within the first page. + * @phys: physical address of the buffer. * @size: buffer size. * @dir: one of possible dma_data_direction values. * @@ -192,7 +191,7 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); * * initializes the buffer, if it is copied from device; * * does both, if this is a DMA_BIDIRECTIONAL transfer. 
 */
-void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
+void kmsan_handle_dma(phys_addr_t phys, size_t size,
 		      enum dma_data_direction dir);

 /**
@@ -372,8 +371,8 @@ static inline void kmsan_iounmap_page_range(unsigned long start,
 {
 }

-static inline void kmsan_handle_dma(struct page *page, size_t offset,
-				    size_t size, enum dma_data_direction dir)
+static inline void kmsan_handle_dma(phys_addr_t phys, size_t size,
+				    enum dma_data_direction dir)
 {
 }

diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 3ac7d15e095f..e47bcf7cc43d 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -172,7 +172,8 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
 		addr = iommu_dma_map_phys(dev, phys, size, dir, attrs);
 	else
 		addr = ops->map_page(dev, page, offset, size, dir, attrs);
-	kmsan_handle_dma(page, offset, size, dir);
+
+	kmsan_handle_dma(phys, size, dir);
 	trace_dma_map_phys(dev, phys, addr, size, dir, attrs);
 	debug_dma_map_phys(dev, phys, size, dir, addr, attrs);

diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 97de3d6194f0..fa9475e5ec4e 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -336,14 +336,16 @@ static void kmsan_handle_dma_page(const void *addr, size_t size,
 }

 /* Helper function to handle DMA data transfers. */
-void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
+void kmsan_handle_dma(phys_addr_t phys, size_t size,
 		      enum dma_data_direction dir)
 {
-	u64 page_offset, to_go, addr;
+	struct page *page = phys_to_page(phys);
+	u64 page_offset, to_go;
+	void *addr;

-	if (PageHighMem(page))
+	if (PhysHighMem(phys))
 		return;
-	addr = (u64)page_address(page) + offset;
+	addr = page_to_virt(page);
 	/*
 	 * The kernel may occasionally give us adjacent DMA pages not belonging
 	 * to the same allocation. Process them separately to avoid triggering
diff --git a/tools/virtio/linux/kmsan.h b/tools/virtio/linux/kmsan.h
index 272b5aa285d5..6cd2e3efd03d 100644
--- a/tools/virtio/linux/kmsan.h
+++ b/tools/virtio/linux/kmsan.h
@@ -4,7 +4,7 @@

 #include 

-inline void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
+inline void kmsan_handle_dma(phys_addr_t phys, size_t size,
 			     enum dma_data_direction dir)
 {
 }
--
cgit v1.2.3


From f7326196a781622b33bfbdabb00f5e72b5fb5679 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Tue, 9 Sep 2025 16:27:39 +0300
Subject: dma-mapping: export new dma_*map_phys() interface

Introduce new DMA mapping functions dma_map_phys() and dma_unmap_phys()
that operate directly on physical addresses instead of page+offset
parameters. This provides a more efficient interface for drivers that
already have physical addresses available.

The new functions are implemented as the primary mapping layer, with
the existing dma_map_page_attrs()/dma_map_resource() and
dma_unmap_page_attrs()/dma_unmap_resource() functions converted to
simple wrappers around the phys-based implementations.

In the case of dma_map_page_attrs(), the struct page is converted to a
physical address with the help of the page_to_phys() function, and
dma_map_resource() provides the physical address as-is together with
the addition of the DMA_ATTR_MMIO attribute.

The old page-based API is preserved in mapping.c to ensure that existing
code won't be affected by changing EXPORT_SYMBOL to EXPORT_SYMBOL_GPL
variant for dma_*map_phys().
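A hypothetical driver-side sketch of the exported pair, with error
handling kept minimal:

	dma_addr_t dma;

	dma = dma_map_phys(dev, phys, len, DMA_TO_DEVICE, 0);
	if (dma_mapping_error(dev, dma))
		return -ENOMEM;

	/* ... hardware consumes the buffer ... */

	dma_unmap_phys(dev, dma, len, DMA_TO_DEVICE, 0);

For MMIO/P2P regions the same pair is used with DMA_ATTR_MMIO, which is
exactly what the dma_map_resource() wrapper in the diff below now does.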
Reviewed-by: Jason Gunthorpe Reviewed-by: Keith Busch Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/54cc52af91777906bbe4a386113437ba0bcfba9c.1757423202.git.leonro@nvidia.com --- drivers/iommu/dma-iommu.c | 14 --------- include/linux/dma-direct.h | 2 -- include/linux/dma-mapping.h | 13 +++++++++ include/linux/iommu-dma.h | 4 --- include/trace/events/dma.h | 2 -- kernel/dma/debug.c | 43 ---------------------------- kernel/dma/debug.h | 21 -------------- kernel/dma/direct.c | 16 ----------- kernel/dma/mapping.c | 69 ++++++++++++++++++++++++--------------------- 9 files changed, 50 insertions(+), 134 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 6804aaf034a1..7944a3af4545 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1556,20 +1556,6 @@ void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, __iommu_dma_unmap(dev, start, end - start); } -dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - return __iommu_dma_map(dev, phys, size, - dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO, - dma_get_mask(dev)); -} - -void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - __iommu_dma_unmap(dev, handle, size); -} - static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr) { size_t alloc_size = PAGE_ALIGN(size); diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index f3bc0bcd7098..c249912456f9 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -149,7 +149,5 @@ void dma_direct_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_addr, enum dma_data_direction dir); int dma_direct_supported(struct device *dev, u64 mask); -dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir, unsigned long attrs); #endif /* _LINUX_DMA_DIRECT_H */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 4254fd9bdf5d..8248ff9363ee 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -138,6 +138,10 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, unsigned long attrs); void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); +dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs); +void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs); unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, @@ -192,6 +196,15 @@ static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } +static inline dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + return DMA_MAPPING_ERROR; +} +static inline void dma_unmap_phys(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ +} static inline unsigned int dma_map_sg_attrs(struct device *dev, 
struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) diff --git a/include/linux/iommu-dma.h b/include/linux/iommu-dma.h index 485bdffed988..a92b3ff9b934 100644 --- a/include/linux/iommu-dma.h +++ b/include/linux/iommu-dma.h @@ -42,10 +42,6 @@ size_t iommu_dma_opt_mapping_size(void); size_t iommu_dma_max_mapping_size(struct device *dev); void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle, unsigned long attrs); -dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys, - size_t size, enum dma_data_direction dir, unsigned long attrs); -void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle, - size_t size, enum dma_data_direction dir, unsigned long attrs); struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs); void iommu_dma_free_noncontiguous(struct device *dev, size_t size, diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h index 84416c7d6bfa..5da59fd8121d 100644 --- a/include/trace/events/dma.h +++ b/include/trace/events/dma.h @@ -73,7 +73,6 @@ DEFINE_EVENT(dma_map, name, \ TP_ARGS(dev, phys_addr, dma_addr, size, dir, attrs)) DEFINE_MAP_EVENT(dma_map_phys); -DEFINE_MAP_EVENT(dma_map_resource); DECLARE_EVENT_CLASS(dma_unmap, TP_PROTO(struct device *dev, dma_addr_t addr, size_t size, @@ -111,7 +110,6 @@ DEFINE_EVENT(dma_unmap, name, \ TP_ARGS(dev, addr, size, dir, attrs)) DEFINE_UNMAP_EVENT(dma_unmap_phys); -DEFINE_UNMAP_EVENT(dma_unmap_resource); DECLARE_EVENT_CLASS(dma_alloc_class, TP_PROTO(struct device *dev, void *virt_addr, dma_addr_t dma_addr, diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index b275db9ca6a0..1e5c64cb6a42 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -38,7 +38,6 @@ enum { dma_debug_single, dma_debug_sg, dma_debug_coherent, - dma_debug_resource, dma_debug_noncoherent, dma_debug_phy, }; @@ -142,7 +141,6 @@ static const char *type2name[] = { [dma_debug_single] = "single", [dma_debug_sg] = "scatter-gather", [dma_debug_coherent] = "coherent", - [dma_debug_resource] = "resource", [dma_debug_noncoherent] = "noncoherent", [dma_debug_phy] = "phy", }; @@ -1446,47 +1444,6 @@ void debug_dma_free_coherent(struct device *dev, size_t size, check_unmap(&ref); } -void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, - int direction, dma_addr_t dma_addr, - unsigned long attrs) -{ - struct dma_debug_entry *entry; - - if (unlikely(dma_debug_disabled())) - return; - - entry = dma_entry_alloc(); - if (!entry) - return; - - entry->type = dma_debug_resource; - entry->dev = dev; - entry->paddr = addr; - entry->size = size; - entry->dev_addr = dma_addr; - entry->direction = direction; - entry->map_err_type = MAP_ERR_NOT_CHECKED; - - add_dma_entry(entry, attrs); -} - -void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction) -{ - struct dma_debug_entry ref = { - .type = dma_debug_resource, - .dev = dev, - .dev_addr = dma_addr, - .size = size, - .direction = direction, - }; - - if (unlikely(dma_debug_disabled())) - return; - - check_unmap(&ref); -} - void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) { diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index bedae973e725..da7be0bddcf6 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -30,14 +30,6 @@ extern void debug_dma_alloc_coherent(struct device *dev, size_t size, extern void 
debug_dma_free_coherent(struct device *dev, size_t size, void *virt, dma_addr_t addr); -extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr, - size_t size, int direction, - dma_addr_t dma_addr, - unsigned long attrs); - -extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction); - extern void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction); @@ -95,19 +87,6 @@ static inline void debug_dma_free_coherent(struct device *dev, size_t size, { } -static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr, - size_t size, int direction, - dma_addr_t dma_addr, - unsigned long attrs) -{ -} - -static inline void debug_dma_unmap_resource(struct device *dev, - dma_addr_t dma_addr, size_t size, - int direction) -{ -} - static inline void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index ba7524f169bc..1f9ee9759426 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -497,22 +497,6 @@ out_unmap: return ret; } -dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - dma_addr_t dma_addr = paddr; - - if (unlikely(!dma_capable(dev, dma_addr, size, false))) { - dev_err_once(dev, - "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - WARN_ON_ONCE(1); - return DMA_MAPPING_ERROR; - } - - return dma_addr; -} - int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 95eab531e227..fe7472f13b10 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -152,12 +152,10 @@ static inline bool dma_map_direct(struct device *dev, return dma_go_direct(dev, *dev->dma_mask, ops); } -dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, - size_t offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) +dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); - phys_addr_t phys = page_to_phys(page) + offset; bool is_mmio = attrs & DMA_ATTR_MMIO; dma_addr_t addr; @@ -177,6 +175,9 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, addr = ops->map_resource(dev, phys, size, dir, attrs); } else { + struct page *page = phys_to_page(phys); + size_t offset = offset_in_page(phys); + /* * The dma_ops API contract for ops->map_page() requires * kmappable memory, while ops->map_resource() does not. 
@@ -191,9 +192,26 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, return addr; } +EXPORT_SYMBOL_GPL(dma_map_phys); + +dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, + size_t offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + phys_addr_t phys = page_to_phys(page) + offset; + + if (unlikely(attrs & DMA_ATTR_MMIO)) + return DMA_MAPPING_ERROR; + + if (IS_ENABLED(CONFIG_DMA_API_DEBUG) && + WARN_ON_ONCE(is_zone_device_page(page))) + return DMA_MAPPING_ERROR; + + return dma_map_phys(dev, phys, size, dir, attrs); +} EXPORT_SYMBOL(dma_map_page_attrs); -void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, +void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -213,6 +231,16 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, trace_dma_unmap_phys(dev, addr, size, dir, attrs); debug_dma_unmap_phys(dev, addr, size, dir); } +EXPORT_SYMBOL_GPL(dma_unmap_phys); + +void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + if (unlikely(attrs & DMA_ATTR_MMIO)) + return; + + dma_unmap_phys(dev, addr, size, dir, attrs); +} EXPORT_SYMBOL(dma_unmap_page_attrs); static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, @@ -338,41 +366,18 @@ EXPORT_SYMBOL(dma_unmap_sg_attrs); dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - const struct dma_map_ops *ops = get_dma_ops(dev); - dma_addr_t addr = DMA_MAPPING_ERROR; - - BUG_ON(!valid_dma_direction(dir)); - - if (WARN_ON_ONCE(!dev->dma_mask)) + if (IS_ENABLED(CONFIG_DMA_API_DEBUG) && + WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr)))) return DMA_MAPPING_ERROR; - if (dma_map_direct(dev, ops)) - addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs); - else if (use_dma_iommu(dev)) - addr = iommu_dma_map_resource(dev, phys_addr, size, dir, attrs); - else if (ops->map_resource) - addr = ops->map_resource(dev, phys_addr, size, dir, attrs); - - trace_dma_map_resource(dev, phys_addr, addr, size, dir, attrs); - debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs); - return addr; + return dma_map_phys(dev, phys_addr, size, dir, attrs | DMA_ATTR_MMIO); } EXPORT_SYMBOL(dma_map_resource); void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - const struct dma_map_ops *ops = get_dma_ops(dev); - - BUG_ON(!valid_dma_direction(dir)); - if (dma_map_direct(dev, ops)) - ; /* nothing to do: uncached and no swiotlb */ - else if (use_dma_iommu(dev)) - iommu_dma_unmap_resource(dev, addr, size, dir, attrs); - else if (ops->unmap_resource) - ops->unmap_resource(dev, addr, size, dir, attrs); - trace_dma_unmap_resource(dev, addr, size, dir, attrs); - debug_dma_unmap_resource(dev, addr, size, dir); + dma_unmap_phys(dev, addr, size, dir, attrs | DMA_ATTR_MMIO); } EXPORT_SYMBOL(dma_unmap_resource); -- cgit v1.2.3
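For illustration, a minimal sketch of the resulting API (not part of the series): after this conversion, dma_map_resource() is a thin wrapper that calls dma_map_phys() with DMA_ATTR_MMIO set, so the two calls below are interchangeable for MMIO ranges. The device and physical range are hypothetical.

	#include <linux/dma-mapping.h>

	/* Sketch: map a peripheral MMIO range for DMA; bar_phys and len are made up. */
	static int example_map_mmio(struct device *dev, phys_addr_t bar_phys, size_t len)
	{
		dma_addr_t addr;

		/* New physical-address API with the explicit MMIO attribute ... */
		addr = dma_map_phys(dev, bar_phys, len, DMA_TO_DEVICE, DMA_ATTR_MMIO);
		if (dma_mapping_error(dev, addr))
			return -EIO;
		dma_unmap_phys(dev, addr, len, DMA_TO_DEVICE, DMA_ATTR_MMIO);

		/* ... which the classic resource API now resolves to internally. */
		addr = dma_map_resource(dev, bar_phys, len, DMA_TO_DEVICE, 0);
		if (dma_mapping_error(dev, addr))
			return -EIO;
		dma_unmap_resource(dev, addr, len, DMA_TO_DEVICE, 0);

		return 0;
	}

Note that dma_map_resource() now warns (under CONFIG_DMA_API_DEBUG) when passed a pfn_valid() address, since struct-page-backed memory should go through dma_map_phys() without DMA_ATTR_MMIO.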
From 5f790208d68fe1526c751dc2af366c7b552b8631 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 10 Sep 2025 22:42:47 +0200 Subject: net: phy: fixed_phy: remove two function stubs Remove stubs for fixed_phy_set_link_update() and fixed_phy_change_carrier() because all callers (actually just one per function) select config symbol FIXED_PHY. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/8729170d-cf39-48d9-aabc-c9aa4acda070@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy_fixed.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 6227a1bdefec..d17ff750c708 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -37,16 +37,6 @@ fixed_phy_register(const struct fixed_phy_status *status, static inline void fixed_phy_unregister(struct phy_device *phydev) { } -static inline int fixed_phy_set_link_update(struct phy_device *phydev, - int (*link_update)(struct net_device *, - struct fixed_phy_status *)) -{ - return -ENODEV; -} -static inline int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier) -{ - return -EINVAL; -} #endif /* CONFIG_FIXED_PHY */ #endif /* __PHY_FIXED_H */ -- cgit v1.2.3 From 9b90afa6d613b66ec4e74ae75f9bfa5baf386ecd Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 10 Sep 2025 09:12:51 +0200 Subject: gpio: move gpio-mmio-specific fields out of struct gpio_chip With all users of bgpio_init() converted to using the modernized generic GPIO chip API, we can now move the gpio-mmio-specific fields out of struct gpio_chip and into the dedicated struct gpio_generic_chip. To that end: adjust the gpio-mmio driver to the new layout, update the docs, etc. The changes in gpio-mlxbf2.c and gpio-mpc8xxx.c are here and not in their respective conversion commits because the former passes the address of the generic chip's lock to the __releases() annotation and we cannot really hide it, while gpio-mpc8xxx.c accesses the shadow registers in a driver-specific workaround and there's no reason to make them available in a public API. Also: drop the relevant task from TODO as it's now done. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250910-gpio-mmio-gpio-conv-part4-v2-15-f3d1a4c57124@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/TODO | 5 - drivers/gpio/gpio-mlxbf2.c | 2 +- drivers/gpio/gpio-mmio.c | 321 ++++++++++++++++++++++--------------------- drivers/gpio/gpio-mpc8xxx.c | 5 +- include/linux/gpio/driver.h | 44 ------ include/linux/gpio/generic.h | 67 ++++++--- 6 files changed, 211 insertions(+), 233 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/TODO b/drivers/gpio/TODO index b797499e627e..8ed74e05903a 100644 --- a/drivers/gpio/TODO +++ b/drivers/gpio/TODO @@ -131,11 +131,6 @@ Work items: helpers (x86 inb()/outb()) and convert port-mapped I/O drivers to use this with dry-coding and sending to maintainers to test -- Move the MMIO GPIO specific fields out of struct gpio_chip into a - dedicated structure. Currently every GPIO chip has them if gpio-mmio is - enabled in Kconfig even if it itself doesn't register with the helper - library. - ------------------------------------------------------------------------------- Generic regmap GPIO diff --git a/drivers/gpio/gpio-mlxbf2.c b/drivers/gpio/gpio-mlxbf2.c index 7e3b526a6caa..abffce3894fc 100644 --- a/drivers/gpio/gpio-mlxbf2.c +++ b/drivers/gpio/gpio-mlxbf2.c @@ -156,7 +156,7 @@ static int mlxbf2_gpio_lock_acquire(struct mlxbf2_gpio_context *gs) * Release the YU arm_gpio_lock after changing the direction mode.
*/ static void mlxbf2_gpio_lock_release(struct mlxbf2_gpio_context *gs) - __releases(&gs->chip.gc.bgpio_lock) + __releases(&gs->chip.lock) __releases(yu_arm_gpio_lock_param.lock) { writel(YU_ARM_GPIO_LOCK_RELEASE, yu_arm_gpio_lock_param.io); diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c index b4f0ab0daaeb..a3df14d672a9 100644 --- a/drivers/gpio/gpio-mmio.c +++ b/drivers/gpio/gpio-mmio.c @@ -125,20 +125,23 @@ static unsigned long bgpio_read32be(void __iomem *reg) static unsigned long bgpio_line2mask(struct gpio_chip *gc, unsigned int line) { - if (gc->be_bits) - return BIT(gc->bgpio_bits - 1 - line); + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + if (chip->be_bits) + return BIT(chip->bits - 1 - line); return BIT(line); } static int bgpio_get_set(struct gpio_chip *gc, unsigned int gpio) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long pinmask = bgpio_line2mask(gc, gpio); - bool dir = !!(gc->bgpio_dir & pinmask); + bool dir = !!(chip->sdir & pinmask); if (dir) - return !!(gc->read_reg(gc->reg_set) & pinmask); - else - return !!(gc->read_reg(gc->reg_dat) & pinmask); + return !!(chip->read_reg(chip->reg_set) & pinmask); + + return !!(chip->read_reg(chip->reg_dat) & pinmask); } /* @@ -148,26 +151,28 @@ static int bgpio_get_set(struct gpio_chip *gc, unsigned int gpio) static int bgpio_get_set_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { - unsigned long get_mask = 0; - unsigned long set_mask = 0; + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + unsigned long get_mask = 0, set_mask = 0; /* Make sure we first clear any bits that are zero when we read the register */ *bits &= ~*mask; - set_mask = *mask & gc->bgpio_dir; - get_mask = *mask & ~gc->bgpio_dir; + set_mask = *mask & chip->sdir; + get_mask = *mask & ~chip->sdir; if (set_mask) - *bits |= gc->read_reg(gc->reg_set) & set_mask; + *bits |= chip->read_reg(chip->reg_set) & set_mask; if (get_mask) - *bits |= gc->read_reg(gc->reg_dat) & get_mask; + *bits |= chip->read_reg(chip->reg_dat) & get_mask; return 0; } static int bgpio_get(struct gpio_chip *gc, unsigned int gpio) { - return !!(gc->read_reg(gc->reg_dat) & bgpio_line2mask(gc, gpio)); + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + return !!(chip->read_reg(chip->reg_dat) & bgpio_line2mask(gc, gpio)); } /* @@ -176,9 +181,11 @@ static int bgpio_get(struct gpio_chip *gc, unsigned int gpio) static int bgpio_get_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + /* Make sure we first clear any bits that are zero when we read the register */ *bits &= ~*mask; - *bits |= gc->read_reg(gc->reg_dat) & *mask; + *bits |= chip->read_reg(chip->reg_dat) & *mask; return 0; } @@ -188,6 +195,7 @@ static int bgpio_get_multiple(struct gpio_chip *gc, unsigned long *mask, static int bgpio_get_multiple_be(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long readmask = 0; unsigned long val; int bit; @@ -200,7 +208,7 @@ static int bgpio_get_multiple_be(struct gpio_chip *gc, unsigned long *mask, readmask |= bgpio_line2mask(gc, bit); /* Read the register */ - val = gc->read_reg(gc->reg_dat) & readmask; + val = chip->read_reg(chip->reg_dat) & readmask; /* * Mirror the result into the "bits" result, this will give line 0 @@ -219,19 +227,20 @@ static int bgpio_set_none(struct gpio_chip *gc, unsigned int gpio, int val) static int 
bgpio_set(struct gpio_chip *gc, unsigned int gpio, int val) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long mask = bgpio_line2mask(gc, gpio); unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); if (val) - gc->bgpio_data |= mask; + chip->sdata |= mask; else - gc->bgpio_data &= ~mask; + chip->sdata &= ~mask; - gc->write_reg(gc->reg_dat, gc->bgpio_data); + chip->write_reg(chip->reg_dat, chip->sdata); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); return 0; } @@ -239,31 +248,32 @@ static int bgpio_set(struct gpio_chip *gc, unsigned int gpio, int val) static int bgpio_set_with_clear(struct gpio_chip *gc, unsigned int gpio, int val) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long mask = bgpio_line2mask(gc, gpio); if (val) - gc->write_reg(gc->reg_set, mask); + chip->write_reg(chip->reg_set, mask); else - gc->write_reg(gc->reg_clr, mask); + chip->write_reg(chip->reg_clr, mask); return 0; } static int bgpio_set_set(struct gpio_chip *gc, unsigned int gpio, int val) { - unsigned long mask = bgpio_line2mask(gc, gpio); - unsigned long flags; + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + unsigned long mask = bgpio_line2mask(gc, gpio), flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); if (val) - gc->bgpio_data |= mask; + chip->sdata |= mask; else - gc->bgpio_data &= ~mask; + chip->sdata &= ~mask; - gc->write_reg(gc->reg_set, gc->bgpio_data); + chip->write_reg(chip->reg_set, chip->sdata); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); return 0; } @@ -273,12 +283,13 @@ static void bgpio_multiple_get_masks(struct gpio_chip *gc, unsigned long *set_mask, unsigned long *clear_mask) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); int i; *set_mask = 0; *clear_mask = 0; - for_each_set_bit(i, mask, gc->bgpio_bits) { + for_each_set_bit(i, mask, chip->bits) { if (test_bit(i, bits)) *set_mask |= bgpio_line2mask(gc, i); else @@ -291,25 +302,27 @@ static void bgpio_set_multiple_single_reg(struct gpio_chip *gc, unsigned long *bits, void __iomem *reg) { - unsigned long flags; - unsigned long set_mask, clear_mask; + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + unsigned long flags, set_mask, clear_mask; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); bgpio_multiple_get_masks(gc, mask, bits, &set_mask, &clear_mask); - gc->bgpio_data |= set_mask; - gc->bgpio_data &= ~clear_mask; + chip->sdata |= set_mask; + chip->sdata &= ~clear_mask; - gc->write_reg(reg, gc->bgpio_data); + chip->write_reg(reg, chip->sdata); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); } static int bgpio_set_multiple(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { - bgpio_set_multiple_single_reg(gc, mask, bits, gc->reg_dat); + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + bgpio_set_multiple_single_reg(gc, mask, bits, chip->reg_dat); return 0; } @@ -317,7 +330,9 @@ static int bgpio_set_multiple(struct gpio_chip *gc, unsigned long *mask, static int bgpio_set_multiple_set(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { - bgpio_set_multiple_single_reg(gc, mask, bits, gc->reg_set); + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + bgpio_set_multiple_single_reg(gc, 
mask, bits, chip->reg_set); return 0; } @@ -326,21 +341,24 @@ static int bgpio_set_multiple_with_clear(struct gpio_chip *gc, unsigned long *mask, unsigned long *bits) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long set_mask, clear_mask; bgpio_multiple_get_masks(gc, mask, bits, &set_mask, &clear_mask); if (set_mask) - gc->write_reg(gc->reg_set, set_mask); + chip->write_reg(chip->reg_set, set_mask); if (clear_mask) - gc->write_reg(gc->reg_clr, clear_mask); + chip->write_reg(chip->reg_clr, clear_mask); return 0; } static int bgpio_dir_return(struct gpio_chip *gc, unsigned int gpio, bool dir_out) { - if (!gc->bgpio_pinctrl) + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + if (!chip->pinctrl) return 0; if (dir_out) @@ -375,39 +393,42 @@ static int bgpio_simple_dir_out(struct gpio_chip *gc, unsigned int gpio, static int bgpio_dir_in(struct gpio_chip *gc, unsigned int gpio) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); - gc->bgpio_dir &= ~bgpio_line2mask(gc, gpio); + chip->sdir &= ~bgpio_line2mask(gc, gpio); - if (gc->reg_dir_in) - gc->write_reg(gc->reg_dir_in, ~gc->bgpio_dir); - if (gc->reg_dir_out) - gc->write_reg(gc->reg_dir_out, gc->bgpio_dir); + if (chip->reg_dir_in) + chip->write_reg(chip->reg_dir_in, ~chip->sdir); + if (chip->reg_dir_out) + chip->write_reg(chip->reg_dir_out, chip->sdir); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); return bgpio_dir_return(gc, gpio, false); } static int bgpio_get_dir(struct gpio_chip *gc, unsigned int gpio) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + /* Return 0 if output, 1 if input */ - if (gc->bgpio_dir_unreadable) { - if (gc->bgpio_dir & bgpio_line2mask(gc, gpio)) + if (chip->dir_unreadable) { + if (chip->sdir & bgpio_line2mask(gc, gpio)) return GPIO_LINE_DIRECTION_OUT; return GPIO_LINE_DIRECTION_IN; } - if (gc->reg_dir_out) { - if (gc->read_reg(gc->reg_dir_out) & bgpio_line2mask(gc, gpio)) + if (chip->reg_dir_out) { + if (chip->read_reg(chip->reg_dir_out) & bgpio_line2mask(gc, gpio)) return GPIO_LINE_DIRECTION_OUT; return GPIO_LINE_DIRECTION_IN; } - if (gc->reg_dir_in) - if (!(gc->read_reg(gc->reg_dir_in) & bgpio_line2mask(gc, gpio))) + if (chip->reg_dir_in) + if (!(chip->read_reg(chip->reg_dir_in) & bgpio_line2mask(gc, gpio))) return GPIO_LINE_DIRECTION_OUT; return GPIO_LINE_DIRECTION_IN; @@ -415,18 +436,19 @@ static int bgpio_get_dir(struct gpio_chip *gc, unsigned int gpio) static void bgpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) { + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); unsigned long flags; - raw_spin_lock_irqsave(&gc->bgpio_lock, flags); + raw_spin_lock_irqsave(&chip->lock, flags); - gc->bgpio_dir |= bgpio_line2mask(gc, gpio); + chip->sdir |= bgpio_line2mask(gc, gpio); - if (gc->reg_dir_in) - gc->write_reg(gc->reg_dir_in, ~gc->bgpio_dir); - if (gc->reg_dir_out) - gc->write_reg(gc->reg_dir_out, gc->bgpio_dir); + if (chip->reg_dir_in) + chip->write_reg(chip->reg_dir_in, ~chip->sdir); + if (chip->reg_dir_out) + chip->write_reg(chip->reg_dir_out, chip->sdir); - raw_spin_unlock_irqrestore(&gc->bgpio_lock, flags); + raw_spin_unlock_irqrestore(&chip->lock, flags); } static int bgpio_dir_out_dir_first(struct gpio_chip *gc, unsigned int gpio, @@ -446,31 +468,30 @@ static int bgpio_dir_out_val_first(struct gpio_chip *gc, unsigned int gpio, } static int 
bgpio_setup_accessors(struct device *dev, - struct gpio_chip *gc, + struct gpio_generic_chip *chip, bool byte_be) { - - switch (gc->bgpio_bits) { + switch (chip->bits) { case 8: - gc->read_reg = bgpio_read8; - gc->write_reg = bgpio_write8; + chip->read_reg = bgpio_read8; + chip->write_reg = bgpio_write8; break; case 16: if (byte_be) { - gc->read_reg = bgpio_read16be; - gc->write_reg = bgpio_write16be; + chip->read_reg = bgpio_read16be; + chip->write_reg = bgpio_write16be; } else { - gc->read_reg = bgpio_read16; - gc->write_reg = bgpio_write16; + chip->read_reg = bgpio_read16; + chip->write_reg = bgpio_write16; } break; case 32: if (byte_be) { - gc->read_reg = bgpio_read32be; - gc->write_reg = bgpio_write32be; + chip->read_reg = bgpio_read32be; + chip->write_reg = bgpio_write32be; } else { - gc->read_reg = bgpio_read32; - gc->write_reg = bgpio_write32; + chip->read_reg = bgpio_read32; + chip->write_reg = bgpio_write32; } break; #if BITS_PER_LONG >= 64 @@ -480,13 +501,13 @@ static int bgpio_setup_accessors(struct device *dev, "64 bit big endian byte order unsupported\n"); return -EINVAL; } else { - gc->read_reg = bgpio_read64; - gc->write_reg = bgpio_write64; + chip->read_reg = bgpio_read64; + chip->write_reg = bgpio_write64; } break; #endif /* BITS_PER_LONG >= 64 */ default: - dev_err(dev, "unsupported data width %u bits\n", gc->bgpio_bits); + dev_err(dev, "unsupported data width %u bits\n", chip->bits); return -EINVAL; } @@ -515,27 +536,25 @@ static int bgpio_setup_accessors(struct device *dev, * - an input direction register (named "dirin") where a 1 bit indicates * the GPIO is an input. */ -static int bgpio_setup_io(struct gpio_chip *gc, - void __iomem *dat, - void __iomem *set, - void __iomem *clr, - unsigned long flags) +static int bgpio_setup_io(struct gpio_generic_chip *chip, + const struct gpio_generic_chip_config *cfg) { + struct gpio_chip *gc = &chip->gc; - gc->reg_dat = dat; - if (!gc->reg_dat) + chip->reg_dat = cfg->dat; + if (!chip->reg_dat) return -EINVAL; - if (set && clr) { - gc->reg_set = set; - gc->reg_clr = clr; + if (cfg->set && cfg->clr) { + chip->reg_set = cfg->set; + chip->reg_clr = cfg->clr; gc->set = bgpio_set_with_clear; gc->set_multiple = bgpio_set_multiple_with_clear; - } else if (set && !clr) { - gc->reg_set = set; + } else if (cfg->set && !cfg->clr) { + chip->reg_set = cfg->set; gc->set = bgpio_set_set; gc->set_multiple = bgpio_set_multiple_set; - } else if (flags & BGPIOF_NO_OUTPUT) { + } else if (cfg->flags & BGPIOF_NO_OUTPUT) { gc->set = bgpio_set_none; gc->set_multiple = NULL; } else { @@ -543,10 +562,10 @@ static int bgpio_setup_io(struct gpio_chip *gc, gc->set_multiple = bgpio_set_multiple; } - if (!(flags & BGPIOF_UNREADABLE_REG_SET) && - (flags & BGPIOF_READ_OUTPUT_REG_SET)) { + if (!(cfg->flags & BGPIOF_UNREADABLE_REG_SET) && + (cfg->flags & BGPIOF_READ_OUTPUT_REG_SET)) { gc->get = bgpio_get_set; - if (!gc->be_bits) + if (!chip->be_bits) gc->get_multiple = bgpio_get_set_multiple; /* * We deliberately avoid assigning the ->get_multiple() call @@ -557,7 +576,7 @@ static int bgpio_setup_io(struct gpio_chip *gc, */ } else { gc->get = bgpio_get; - if (gc->be_bits) + if (chip->be_bits) gc->get_multiple = bgpio_get_multiple_be; else gc->get_multiple = bgpio_get_multiple; @@ -566,27 +585,27 @@ static int bgpio_setup_io(struct gpio_chip *gc, return 0; } -static int bgpio_setup_direction(struct gpio_chip *gc, - void __iomem *dirout, - void __iomem *dirin, - unsigned long flags) +static int bgpio_setup_direction(struct gpio_generic_chip *chip, + const struct 
gpio_generic_chip_config *cfg) { - if (dirout || dirin) { - gc->reg_dir_out = dirout; - gc->reg_dir_in = dirin; - if (flags & BGPIOF_NO_SET_ON_INPUT) + struct gpio_chip *gc = &chip->gc; + + if (cfg->dirout || cfg->dirin) { + chip->reg_dir_out = cfg->dirout; + chip->reg_dir_in = cfg->dirin; + if (cfg->flags & BGPIOF_NO_SET_ON_INPUT) gc->direction_output = bgpio_dir_out_dir_first; else gc->direction_output = bgpio_dir_out_val_first; gc->direction_input = bgpio_dir_in; gc->get_direction = bgpio_get_dir; } else { - if (flags & BGPIOF_NO_OUTPUT) + if (cfg->flags & BGPIOF_NO_OUTPUT) gc->direction_output = bgpio_dir_out_err; else gc->direction_output = bgpio_simple_dir_out; - if (flags & BGPIOF_NO_INPUT) + if (cfg->flags & BGPIOF_NO_INPUT) gc->direction_input = bgpio_dir_in_err; else gc->direction_input = bgpio_simple_dir_in; @@ -595,117 +614,101 @@ static int bgpio_setup_direction(struct gpio_chip *gc, return 0; } -static int bgpio_request(struct gpio_chip *chip, unsigned gpio_pin) +static int bgpio_request(struct gpio_chip *gc, unsigned int gpio_pin) { - if (gpio_pin >= chip->ngpio) + struct gpio_generic_chip *chip = to_gpio_generic_chip(gc); + + if (gpio_pin >= gc->ngpio) return -EINVAL; - if (chip->bgpio_pinctrl) - return gpiochip_generic_request(chip, gpio_pin); + if (chip->pinctrl) + return gpiochip_generic_request(gc, gpio_pin); return 0; } /** - * bgpio_init() - Initialize generic GPIO accessor functions - * @gc: the GPIO chip to set up - * @dev: the parent device of the new GPIO chip (compulsory) - * @sz: the size (width) of the MMIO registers in bytes, typically 1, 2 or 4 - * @dat: MMIO address for the register to READ the value of the GPIO lines, it - * is expected that a 1 in the corresponding bit in this register means the - * line is asserted - * @set: MMIO address for the register to SET the value of the GPIO lines, it is - * expected that we write the line with 1 in this register to drive the GPIO line - * high. - * @clr: MMIO address for the register to CLEAR the value of the GPIO lines, it is - * expected that we write the line with 1 in this register to drive the GPIO line - * low. It is allowed to leave this address as NULL, in that case the SET register - * will be assumed to also clear the GPIO lines, by actively writing the line - * with 0. - * @dirout: MMIO address for the register to set the line as OUTPUT. It is assumed - * that setting a line to 1 in this register will turn that line into an - * output line. Conversely, setting the line to 0 will turn that line into - * an input. - * @dirin: MMIO address for the register to set this line as INPUT. It is assumed - * that setting a line to 1 in this register will turn that line into an - * input line. Conversely, setting the line to 0 will turn that line into - * an output. - * @flags: Different flags that will affect the behaviour of the device, such as - * endianness etc. + * gpio_generic_chip_init() - Initialize a generic GPIO chip. + * @chip: Generic GPIO chip to set up. + * @cfg: Generic GPIO chip configuration. + * + * Returns 0 on success, negative error number on failure. 
*/ -int bgpio_init(struct gpio_chip *gc, struct device *dev, - unsigned long sz, void __iomem *dat, void __iomem *set, - void __iomem *clr, void __iomem *dirout, void __iomem *dirin, - unsigned long flags) +int gpio_generic_chip_init(struct gpio_generic_chip *chip, + const struct gpio_generic_chip_config *cfg) { + struct gpio_chip *gc = &chip->gc; + unsigned long flags = cfg->flags; + struct device *dev = cfg->dev; int ret; - if (!is_power_of_2(sz)) + if (!is_power_of_2(cfg->sz)) return -EINVAL; - gc->bgpio_bits = sz * 8; - if (gc->bgpio_bits > BITS_PER_LONG) + chip->bits = cfg->sz * 8; + if (chip->bits > BITS_PER_LONG) return -EINVAL; - raw_spin_lock_init(&gc->bgpio_lock); + raw_spin_lock_init(&chip->lock); gc->parent = dev; gc->label = dev_name(dev); gc->base = -1; gc->request = bgpio_request; - gc->be_bits = !!(flags & BGPIOF_BIG_ENDIAN); + chip->be_bits = !!(flags & BGPIOF_BIG_ENDIAN); ret = gpiochip_get_ngpios(gc, dev); if (ret) - gc->ngpio = gc->bgpio_bits; + gc->ngpio = chip->bits; - ret = bgpio_setup_io(gc, dat, set, clr, flags); + ret = bgpio_setup_io(chip, cfg); if (ret) return ret; - ret = bgpio_setup_accessors(dev, gc, flags & BGPIOF_BIG_ENDIAN_BYTE_ORDER); + ret = bgpio_setup_accessors(dev, chip, + flags & BGPIOF_BIG_ENDIAN_BYTE_ORDER); if (ret) return ret; - ret = bgpio_setup_direction(gc, dirout, dirin, flags); + ret = bgpio_setup_direction(chip, cfg); if (ret) return ret; if (flags & BGPIOF_PINCTRL_BACKEND) { - gc->bgpio_pinctrl = true; + chip->pinctrl = true; /* Currently this callback is only used for pincontrol */ gc->free = gpiochip_generic_free; } - gc->bgpio_data = gc->read_reg(gc->reg_dat); + chip->sdata = chip->read_reg(chip->reg_dat); if (gc->set == bgpio_set_set && !(flags & BGPIOF_UNREADABLE_REG_SET)) - gc->bgpio_data = gc->read_reg(gc->reg_set); + chip->sdata = chip->read_reg(chip->reg_set); if (flags & BGPIOF_UNREADABLE_REG_DIR) - gc->bgpio_dir_unreadable = true; + chip->dir_unreadable = true; /* * Inspect hardware to find initial direction setting. */ - if ((gc->reg_dir_out || gc->reg_dir_in) && + if ((chip->reg_dir_out || chip->reg_dir_in) && !(flags & BGPIOF_UNREADABLE_REG_DIR)) { - if (gc->reg_dir_out) - gc->bgpio_dir = gc->read_reg(gc->reg_dir_out); - else if (gc->reg_dir_in) - gc->bgpio_dir = ~gc->read_reg(gc->reg_dir_in); + if (chip->reg_dir_out) + chip->sdir = chip->read_reg(chip->reg_dir_out); + else if (chip->reg_dir_in) + chip->sdir = ~chip->read_reg(chip->reg_dir_in); /* * If we have two direction registers, synchronise * input setting to output setting, the library * can not handle a line being input and output at * the same time. 
*/ - if (gc->reg_dir_out && gc->reg_dir_in) - gc->write_reg(gc->reg_dir_in, ~gc->bgpio_dir); + if (chip->reg_dir_out && chip->reg_dir_in) + chip->write_reg(chip->reg_dir_in, ~chip->sdir); } return ret; } -EXPORT_SYMBOL_GPL(bgpio_init); +EXPORT_SYMBOL_GPL(gpio_generic_chip_init); #if IS_ENABLED(CONFIG_GPIO_GENERIC_PLATFORM) diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c index dd2cd2cc6e6f..a2a83afb41bb 100644 --- a/drivers/gpio/gpio-mpc8xxx.c +++ b/drivers/gpio/gpio-mpc8xxx.c @@ -71,7 +71,7 @@ static int mpc8572_gpio_get(struct gpio_chip *gc, unsigned int gpio) mpc8xxx_gc->regs + GPIO_DIR); val = gpio_generic_read_reg(&mpc8xxx_gc->chip, mpc8xxx_gc->regs + GPIO_DAT) & ~out_mask; - out_shadow = gc->bgpio_data & out_mask; + out_shadow = mpc8xxx_gc->chip.sdata & out_mask; return !!((val | out_shadow) & mpc_pin2mask(gpio)); } @@ -399,7 +399,8 @@ static int mpc8xxx_probe(struct platform_device *pdev) gpio_generic_write_reg(&mpc8xxx_gc->chip, mpc8xxx_gc->regs + GPIO_IBE, 0xffffffff); /* Also, latch state of GPIOs configured as output by bootloader. */ - gc->bgpio_data = gpio_generic_read_reg(&mpc8xxx_gc->chip, + mpc8xxx_gc->chip.sdata = + gpio_generic_read_reg(&mpc8xxx_gc->chip, mpc8xxx_gc->regs + GPIO_DAT) & gpio_generic_read_reg(&mpc8xxx_gc->chip, mpc8xxx_gc->regs + GPIO_DIR); diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 9fcd4a988081..9b14fd20f13e 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -388,28 +388,6 @@ struct gpio_irq_chip { * implies that if the chip supports IRQs, these IRQs need to be threaded * as the chip access may sleep when e.g. reading out the IRQ status * registers. - * @read_reg: reader function for generic GPIO - * @write_reg: writer function for generic GPIO - * @be_bits: if the generic GPIO has big endian bit order (bit 31 is representing - * line 0, bit 30 is line 1 ... bit 0 is line 31) this is set to true by the - * generic GPIO core. It is for internal housekeeping only. - * @reg_dat: data (in) register for generic GPIO - * @reg_set: output set register (out=high) for generic GPIO - * @reg_clr: output clear register (out=low) for generic GPIO - * @reg_dir_out: direction out setting register for generic GPIO - * @reg_dir_in: direction in setting register for generic GPIO - * @bgpio_dir_unreadable: indicates that the direction register(s) cannot - * be read and we need to rely on our internal state tracking. - * @bgpio_pinctrl: the generic GPIO uses a pin control backend. - * @bgpio_bits: number of register bits used for a generic GPIO i.e. - * <register width> * 8 - * @bgpio_lock: used to lock chip->bgpio_data. Also, this is needed to keep - * shadowed and real data registers writes together. - * @bgpio_data: shadowed data register for generic GPIO to clear/set bits - * safely. - * @bgpio_dir: shadowed direction register for generic GPIO to clear/set - * direction safely. A "1" in this word means the line is set as - * output. * * A gpio_chip can help platforms abstract various sources of GPIOs so they can all be accessed through a common programming interface.
@@ -475,23 +453,6 @@ struct gpio_chip { const char *const *names; bool can_sleep; -#if IS_ENABLED(CONFIG_GPIO_GENERIC) - unsigned long (*read_reg)(void __iomem *reg); - void (*write_reg)(void __iomem *reg, unsigned long data); - bool be_bits; - void __iomem *reg_dat; - void __iomem *reg_set; - void __iomem *reg_clr; - void __iomem *reg_dir_out; - void __iomem *reg_dir_in; - bool bgpio_dir_unreadable; - bool bgpio_pinctrl; - int bgpio_bits; - raw_spinlock_t bgpio_lock; - unsigned long bgpio_data; - unsigned long bgpio_dir; -#endif /* CONFIG_GPIO_GENERIC */ - #ifdef CONFIG_GPIOLIB_IRQCHIP /* * With CONFIG_GPIOLIB_IRQCHIP we get an irqchip inside the gpiolib @@ -723,11 +684,6 @@ int gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc, #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ -int bgpio_init(struct gpio_chip *gc, struct device *dev, - unsigned long sz, void __iomem *dat, void __iomem *set, - void __iomem *clr, void __iomem *dirout, void __iomem *dirin, - unsigned long flags); - #define BGPIOF_BIG_ENDIAN BIT(0) #define BGPIOF_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ #define BGPIOF_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h index 4c0626b53ec9..162430d96660 100644 --- a/include/linux/gpio/generic.h +++ b/include/linux/gpio/generic.h @@ -50,9 +50,44 @@ struct gpio_generic_chip_config { * struct gpio_generic_chip - Generic GPIO chip implementation. * @gc: The underlying struct gpio_chip object, implementing low-level GPIO * chip routines. + * @read_reg: reader function for generic GPIO + * @write_reg: writer function for generic GPIO + * @be_bits: if the generic GPIO has big endian bit order (bit 31 is + * representing line 0, bit 30 is line 1 ... bit 0 is line 31) this + * is set to true by the generic GPIO core. It is for internal + * housekeeping only. + * @reg_dat: data (in) register for generic GPIO + * @reg_set: output set register (out=high) for generic GPIO + * @reg_clr: output clear register (out=low) for generic GPIO + * @reg_dir_out: direction out setting register for generic GPIO + * @reg_dir_in: direction in setting register for generic GPIO + * @dir_unreadable: indicates that the direction register(s) cannot be read and + * we need to rely on our internal state tracking. + * @pinctrl: the generic GPIO uses a pin control backend. + * @bits: number of register bits used for a generic GPIO + * i.e. <register width> * 8 + * @lock: used to lock chip->sdata. Also, this is needed to keep + * shadowed and real data registers writes together. + * @sdata: shadowed data register for generic GPIO to clear/set bits safely. + * @sdir: shadowed direction register for generic GPIO to clear/set direction + * safely. A "1" in this word means the line is set as output. */ struct gpio_generic_chip { struct gpio_chip gc; + unsigned long (*read_reg)(void __iomem *reg); + void (*write_reg)(void __iomem *reg, unsigned long data); + bool be_bits; + void __iomem *reg_dat; + void __iomem *reg_set; + void __iomem *reg_clr; + void __iomem *reg_dir_out; + void __iomem *reg_dir_in; + bool dir_unreadable; + bool pinctrl; + int bits; + raw_spinlock_t lock; + unsigned long sdata; + unsigned long sdir; }; static inline struct gpio_generic_chip * @@ -61,20 +96,8 @@ to_gpio_generic_chip(struct gpio_chip *gc) { return container_of(gc, struct gpio_generic_chip, gc); } -/** - * gpio_generic_chip_init() - Initialize a generic GPIO chip. - * @chip: Generic GPIO chip to set up. - * @cfg: Generic GPIO chip configuration.
- * - * Returns 0 on success, negative error number on failure. - */ -static inline int -gpio_generic_chip_init(struct gpio_generic_chip *chip, - const struct gpio_generic_chip_config *cfg) -{ - return bgpio_init(&chip->gc, cfg->dev, cfg->sz, cfg->dat, cfg->set, - cfg->clr, cfg->dirout, cfg->dirin, cfg->flags); -} +int gpio_generic_chip_init(struct gpio_generic_chip *chip, + const struct gpio_generic_chip_config *cfg); /** * gpio_generic_chip_set() - Set the GPIO line value of the generic GPIO chip. @@ -110,10 +133,10 @@ gpio_generic_chip_set(struct gpio_generic_chip *chip, unsigned int offset, static inline unsigned long gpio_generic_read_reg(struct gpio_generic_chip *chip, void __iomem *reg) { - if (WARN_ON(!chip->gc.read_reg)) + if (WARN_ON(!chip->read_reg)) return 0; - return chip->gc.read_reg(reg); + return chip->read_reg(reg); } /** @@ -125,23 +148,23 @@ gpio_generic_read_reg(struct gpio_generic_chip *chip, void __iomem *reg) static inline void gpio_generic_write_reg(struct gpio_generic_chip *chip, void __iomem *reg, unsigned long val) { - if (WARN_ON(!chip->gc.write_reg)) + if (WARN_ON(!chip->write_reg)) return; - chip->gc.write_reg(reg, val); + chip->write_reg(reg, val); } #define gpio_generic_chip_lock(gen_gc) \ - raw_spin_lock(&(gen_gc)->gc.bgpio_lock) + raw_spin_lock(&(gen_gc)->lock) #define gpio_generic_chip_unlock(gen_gc) \ - raw_spin_unlock(&(gen_gc)->gc.bgpio_lock) + raw_spin_unlock(&(gen_gc)->lock) #define gpio_generic_chip_lock_irqsave(gen_gc, flags) \ - raw_spin_lock_irqsave(&(gen_gc)->gc.bgpio_lock, flags) + raw_spin_lock_irqsave(&(gen_gc)->lock, flags) #define gpio_generic_chip_unlock_irqrestore(gen_gc, flags) \ - raw_spin_unlock_irqrestore(&(gen_gc)->gc.bgpio_lock, flags) + raw_spin_unlock_irqrestore(&(gen_gc)->lock, flags) DEFINE_LOCK_GUARD_1(gpio_generic_lock, struct gpio_generic_chip, -- cgit v1.2.3
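A conversion sketch (an illustration, not taken from the series): a driver that used to call bgpio_init() with a long argument list now embeds struct gpio_generic_chip and fills in a struct gpio_generic_chip_config instead. The driver name, probe function and register offsets below are hypothetical.

	#include <linux/gpio/driver.h>
	#include <linux/gpio/generic.h>
	#include <linux/platform_device.h>

	struct foo_gpio {
		struct gpio_generic_chip chip;	/* embeds struct gpio_chip as chip.gc */
	};

	static int foo_gpio_probe(struct platform_device *pdev)
	{
		struct gpio_generic_chip_config cfg;
		struct foo_gpio *fg;
		void __iomem *base;
		int ret;

		fg = devm_kzalloc(&pdev->dev, sizeof(*fg), GFP_KERNEL);
		if (!fg)
			return -ENOMEM;

		base = devm_platform_ioremap_resource(pdev, 0);
		if (IS_ERR(base))
			return PTR_ERR(base);

		cfg = (struct gpio_generic_chip_config){
			.dev = &pdev->dev,
			.sz = 4,		/* 32-bit registers */
			.dat = base + 0x00,	/* hypothetical offsets */
			.set = base + 0x04,
			.clr = base + 0x08,
			.dirout = base + 0x0c,
		};

		ret = gpio_generic_chip_init(&fg->chip, &cfg);
		if (ret)
			return ret;

		/* The wrapped gpio_chip is registered exactly as before. */
		return devm_gpiochip_add_data(&pdev->dev, &fg->chip.gc, fg);
	}

Shadow state and accessors now live in struct gpio_generic_chip (chip.sdata, gpio_generic_read_reg(), gpio_generic_write_reg()) rather than in struct gpio_chip, as the gpio-mpc8xxx.c hunks above show.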
From 121a0f839dbb397af5fabb701cea3e9983223e50 Mon Sep 17 00:00:00 2001 From: Israel Cepeda Date: Thu, 11 Sep 2025 20:13:41 +0200 Subject: usb: misc: Add Intel USBIO bridge driver Add a driver for the Intel USBIO USB IO-expander used by the MIPI cameras on various new (Meteor Lake and later) Intel laptops. This is a USB bridge driver which adds auxbus child devices for the GPIO, I2C and SPI functions of the USBIO chip and exports IO-functions that the drivers for those auxbus child devices use to communicate with the USBIO device's firmware. Co-developed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Israel Cepeda Link: https://lore.kernel.org/r/20250911181343.77398-2-hansg@kernel.org Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 8 + drivers/usb/misc/Kconfig | 14 + drivers/usb/misc/Makefile | 1 + drivers/usb/misc/usbio.c | 749 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/usb/usbio.h | 177 +++++++++++ 5 files changed, 949 insertions(+) create mode 100644 drivers/usb/misc/usbio.c create mode 100644 include/linux/usb/usbio.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index fed6cd812d79..434ac1593332 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12688,6 +12688,14 @@ S: Maintained F: Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst F: drivers/platform/x86/intel/uncore-frequency/ +INTEL USBIO USB I/O EXPANDER DRIVERS +M: Israel Cepeda +M: Hans de Goede +R: Sakari Ailus +S: Maintained +F: drivers/usb/misc/usbio.c +F: include/linux/usb/usbio.h + INTEL VENDOR SPECIFIC EXTENDED CAPABILITIES DRIVER M: David E. Box S: Supported diff --git a/drivers/usb/misc/Kconfig b/drivers/usb/misc/Kconfig index 26eaae32f738..09ac6f1c985f 100644 --- a/drivers/usb/misc/Kconfig +++ b/drivers/usb/misc/Kconfig @@ -179,6 +179,20 @@ config USB_LJCA This driver can also be built as a module. If so, the module will be called usb-ljca. +config USB_USBIO + tristate "Intel USBIO Bridge support" + depends on USB && ACPI + select AUXILIARY_BUS + help + This adds support for Intel USBIO drivers. + This enables the USBIO bridge driver module, which is in charge + of talking to the USB device. Additional drivers such as GPIO_USBIO and + I2C_USBIO must be enabled in order to use the device's full + functionality. + + This driver can also be built as a module. If so, the module + will be called usbio. + source "drivers/usb/misc/sisusbvga/Kconfig" config USB_LD diff --git a/drivers/usb/misc/Makefile b/drivers/usb/misc/Makefile index 0cd5bc8f52fe..494ab0377f35 100644 --- a/drivers/usb/misc/Makefile +++ b/drivers/usb/misc/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_USB_EMI62) += emi62.o obj-$(CONFIG_USB_EZUSB_FX2) += ezusb.o obj-$(CONFIG_APPLE_MFI_FASTCHARGE) += apple-mfi-fastcharge.o obj-$(CONFIG_USB_LJCA) += usb-ljca.o +obj-$(CONFIG_USB_USBIO) += usbio.o obj-$(CONFIG_USB_IDMOUSE) += idmouse.o obj-$(CONFIG_USB_IOWARRIOR) += iowarrior.o obj-$(CONFIG_USB_ISIGHTFW) += isight_firmware.o diff --git a/drivers/usb/misc/usbio.c b/drivers/usb/misc/usbio.c new file mode 100644 index 000000000000..37644dddf157 --- /dev/null +++ b/drivers/usb/misc/usbio.c @@ -0,0 +1,749 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel USBIO Bridge driver + * + * Copyright (c) 2025 Intel Corporation. + * Copyright (c) 2025 Red Hat, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/************************************* + * USBIO Bridge Protocol Definitions * + *************************************/ + +/* USBIO Control Commands */ +#define USBIO_CTRLCMD_PROTVER 0 +#define USBIO_CTRLCMD_FWVER 1 +#define USBIO_CTRLCMD_HS 2 +#define USBIO_CTRLCMD_ENUMGPIO 16 +#define USBIO_CTRLCMD_ENUMI2C 17 + +/* USBIO Packet Flags */ +#define USBIO_PKTFLAG_ACK BIT(0) +#define USBIO_PKTFLAG_RSP BIT(1) +#define USBIO_PKTFLAG_CMP BIT(2) +#define USBIO_PKTFLAG_ERR BIT(3) + +#define USBIO_PKTFLAGS_REQRESP (USBIO_PKTFLAG_CMP | USBIO_PKTFLAG_ACK) + +#define USBIO_CTRLXFER_TIMEOUT 0 +#define USBIO_BULKXFER_TIMEOUT 100 + +struct usbio_protver { + u8 ver; +} __packed; + +struct usbio_fwver { + u8 major; + u8 minor; + __le16 patch; + __le16 build; +} __packed; + +/*********************************** + * USBIO Bridge Device Definitions * + ***********************************/ + +/** + * struct usbio_device - the usb device exposing IOs + * + * @dev: the device in the usb interface + * @udev: the detected usb device + * @intf: the usb interface + * @quirks: quirks + * @ctrl_mutex: protects ctrlbuf + * @ctrl_pipe: the control transfer pipe + * @ctrlbuf_len: the size of the control transfer pipe + * @ctrlbuf: the buffer used for control transfers + * @bulk_mutex: protects txbuf, rxbuf and split bulk-transfers getting interrupted + * @tx_pipe: the bulk out pipe + * @txbuf_len: the size of the bulk out pipe + * @txbuf: the buffer used for bulk out transfers + * @rx_pipe: the bulk in pipe + * @rxbuf_len: the size of the bulk in pipe + * @rxdat_len: the data length at rx buffer + * @rxbuf: the buffer used for bulk in transfers + * @urb: the urb to read bulk pipe + * @done: completion object as request is done + *
@cli_list: device's client list + * @nr_gpio_banks: Number of GPIO banks + * @gpios: GPIO bank descriptors + * @nr_i2c_buses: Number of I2C busses + * @i2cs: I2C bus descriptors + */ +struct usbio_device { + struct device *dev; + struct usb_device *udev; + struct usb_interface *intf; + unsigned long quirks; + + struct mutex ctrl_mutex; + unsigned int ctrl_pipe; + u16 ctrlbuf_len; + void *ctrlbuf; + + struct mutex bulk_mutex; + unsigned int tx_pipe; + u16 txbuf_len; + void *txbuf; + + unsigned int rx_pipe; + u16 rxbuf_len; + u16 rxdat_len; + void *rxbuf; + struct urb *urb; + + struct completion done; + + struct list_head cli_list; + + unsigned int nr_gpio_banks; + struct usbio_gpio_bank_desc gpios[USBIO_MAX_GPIOBANKS]; + + unsigned int nr_i2c_buses; + struct usbio_i2c_bus_desc i2cs[USBIO_MAX_I2CBUSES]; +}; + +/** + * struct usbio_client - represents a usbio client + * + * @auxdev: auxiliary device object + * @mutex: protects @bridge + * @bridge: the usbio bridge that services the client + * @link: usbio bridge clients list member + */ +struct usbio_client { + struct auxiliary_device auxdev; + struct mutex mutex; + struct usbio_device *bridge; + struct list_head link; +}; + +#define adev_to_client(adev) container_of_const(adev, struct usbio_client, auxdev) + +static int usbio_ctrl_msg(struct usbio_device *usbio, u8 type, u8 cmd, + const void *obuf, u16 obuf_len, void *ibuf, u16 ibuf_len) +{ + u8 request = USB_TYPE_VENDOR | USB_RECIP_DEVICE; + struct usbio_ctrl_packet *cpkt; + unsigned int pipe; + u16 cpkt_len; + int ret; + + lockdep_assert_held(&usbio->ctrl_mutex); + + if ((obuf_len > (usbio->ctrlbuf_len - sizeof(*cpkt))) || + (ibuf_len > (usbio->ctrlbuf_len - sizeof(*cpkt)))) + return -EMSGSIZE; + + /* Prepare Control Packet Header */ + cpkt = usbio->ctrlbuf; + cpkt->header.type = type; + cpkt->header.cmd = cmd; + if (type == USBIO_PKTTYPE_CTRL || ibuf_len) + cpkt->header.flags = USBIO_PKTFLAGS_REQRESP; + else + cpkt->header.flags = USBIO_PKTFLAG_CMP; + cpkt->len = obuf_len; + + /* Copy the data */ + memcpy(cpkt->data, obuf, obuf_len); + + pipe = usb_sndctrlpipe(usbio->udev, usbio->ctrl_pipe); + cpkt_len = sizeof(*cpkt) + obuf_len; + ret = usb_control_msg(usbio->udev, pipe, 0, request | USB_DIR_OUT, 0, 0, + cpkt, cpkt_len, USBIO_CTRLXFER_TIMEOUT); + dev_dbg(usbio->dev, "control out %d hdr %*phN data %*phN\n", ret, + (int)sizeof(*cpkt), cpkt, (int)cpkt->len, cpkt->data); + + if (ret != cpkt_len) { + dev_err(usbio->dev, "USB control out failed: %d\n", ret); + return (ret < 0) ? ret : -EPROTO; + } + + if (!(cpkt->header.flags & USBIO_PKTFLAG_ACK)) + return 0; + + pipe = usb_rcvctrlpipe(usbio->udev, usbio->ctrl_pipe); + cpkt_len = sizeof(*cpkt) + ibuf_len; + ret = usb_control_msg(usbio->udev, pipe, 0, request | USB_DIR_IN, 0, 0, + cpkt, cpkt_len, USBIO_CTRLXFER_TIMEOUT); + dev_dbg(usbio->dev, "control in %d hdr %*phN data %*phN\n", ret, + (int)sizeof(*cpkt), cpkt, (int)cpkt->len, cpkt->data); + + if (ret < sizeof(*cpkt)) { + dev_err(usbio->dev, "USB control in failed: %d\n", ret); + return (ret < 0) ?
ret : -EPROTO; + } + + if (cpkt->header.type != type || cpkt->header.cmd != cmd || + !(cpkt->header.flags & USBIO_PKTFLAG_RSP)) { + dev_err(usbio->dev, "Unexpected reply type: %u, cmd: %u, flags: %u\n", + cpkt->header.type, cpkt->header.cmd, cpkt->header.flags); + return -EPROTO; + } + + if (cpkt->header.flags & USBIO_PKTFLAG_ERR) + return -EREMOTEIO; + + if (ibuf_len < cpkt->len) + return -ENOSPC; + + memcpy(ibuf, cpkt->data, cpkt->len); + + return cpkt->len; +} + +int usbio_control_msg(struct auxiliary_device *adev, u8 type, u8 cmd, + const void *obuf, u16 obuf_len, void *ibuf, u16 ibuf_len) +{ + struct usbio_client *client = adev_to_client(adev); + struct usbio_device *usbio; + int ret; + + guard(mutex)(&client->mutex); + + usbio = client->bridge; + if (!usbio) + return -ENODEV; /* Disconnected */ + + ret = usb_autopm_get_interface(usbio->intf); + if (ret) + return ret; + + mutex_lock(&usbio->ctrl_mutex); + + ret = usbio_ctrl_msg(client->bridge, type, cmd, obuf, obuf_len, ibuf, ibuf_len); + + mutex_unlock(&usbio->ctrl_mutex); + usb_autopm_put_interface(usbio->intf); + + return ret; +} +EXPORT_SYMBOL_NS_GPL(usbio_control_msg, "USBIO"); + +static void usbio_bulk_recv(struct urb *urb) +{ + struct usbio_bulk_packet *bpkt = urb->transfer_buffer; + struct usbio_device *usbio = urb->context; + + if (!urb->status) { + if (bpkt->header.flags & USBIO_PKTFLAG_RSP) { + usbio->rxdat_len = urb->actual_length; + complete(&usbio->done); + } + } else if (urb->status != -ENOENT) { + dev_err(usbio->dev, "Bulk in error %d\n", urb->status); + } + + usb_submit_urb(usbio->urb, GFP_ATOMIC); +} + +int usbio_bulk_msg(struct auxiliary_device *adev, u8 type, u8 cmd, bool last, + const void *obuf, u16 obuf_len, void *ibuf, u16 ibuf_len) +{ + struct usbio_client *client = adev_to_client(adev); + struct usbio_device *usbio = client->bridge; + struct usbio_bulk_packet *bpkt; + int ret, act = 0; + u16 bpkt_len; + + lockdep_assert_held(&client->mutex); + lockdep_assert_held(&usbio->bulk_mutex); + + if ((obuf_len > (usbio->txbuf_len - sizeof(*bpkt))) || + (ibuf_len > (usbio->txbuf_len - sizeof(*bpkt)))) + return -EMSGSIZE; + + if (ibuf_len) + reinit_completion(&usbio->done); + + /* If no data to send, skip to read */ + if (!obuf_len) + goto read; + + /* Prepare Bulk Packet Header */ + bpkt = usbio->txbuf; + bpkt->header.type = type; + bpkt->header.cmd = cmd; + if (!last) + bpkt->header.flags = 0; + else if (ibuf_len) + bpkt->header.flags = USBIO_PKTFLAGS_REQRESP; + else + bpkt->header.flags = USBIO_PKTFLAG_CMP; + bpkt->len = cpu_to_le16(obuf_len); + + /* Copy the data */ + memcpy(bpkt->data, obuf, obuf_len); + + bpkt_len = sizeof(*bpkt) + obuf_len; + ret = usb_bulk_msg(usbio->udev, usbio->tx_pipe, bpkt, bpkt_len, &act, + USBIO_BULKXFER_TIMEOUT); + dev_dbg(usbio->dev, "bulk out %d hdr %*phN data %*phN\n", act, + (int)sizeof(*bpkt), bpkt, obuf_len, bpkt->data); + + if (ret || act != bpkt_len) { + dev_err(usbio->dev, "Bulk out failed: %d\n", ret); + return ret ?: -EPROTO; + } + + if (!(bpkt->header.flags & USBIO_PKTFLAG_ACK)) + return obuf_len; + +read: + ret = wait_for_completion_timeout(&usbio->done, USBIO_BULKXFER_TIMEOUT); + if (ret <= 0) { + dev_err(usbio->dev, "Bulk in wait failed: %d\n", ret); + return ret ?: -ETIMEDOUT; + } + + act = usbio->rxdat_len; + bpkt = usbio->rxbuf; + bpkt_len = le16_to_cpu(bpkt->len); + dev_dbg(usbio->dev, "bulk in %d hdr %*phN data %*phN\n", act, + (int)sizeof(*bpkt), bpkt, bpkt_len, bpkt->data); + + /* + * Unsupported bulk commands get only an usbio_packet_header with + * the error flag 
set as reply. Return -EPIPE for this case. + */ + if (act == sizeof(struct usbio_packet_header) && + (bpkt->header.flags & USBIO_PKTFLAG_ERR)) + return -EPIPE; + + if (act < sizeof(*bpkt)) { + dev_err(usbio->dev, "Bulk in short read: %d\n", act); + return -EPROTO; + } + + if (bpkt->header.type != type || bpkt->header.cmd != cmd || + !(bpkt->header.flags & USBIO_PKTFLAG_RSP)) { + dev_err(usbio->dev, + "Unexpected bulk in type 0x%02x cmd 0x%02x flags 0x%02x\n", + bpkt->header.type, bpkt->header.cmd, bpkt->header.flags); + return -EPROTO; + } + + if (bpkt->header.flags & USBIO_PKTFLAG_ERR) + return -EREMOTEIO; + + if (ibuf_len < bpkt_len) + return -ENOSPC; + + memcpy(ibuf, bpkt->data, bpkt_len); + + return bpkt_len; +} +EXPORT_SYMBOL_NS_GPL(usbio_bulk_msg, "USBIO"); + +int usbio_acquire(struct auxiliary_device *adev) +{ + struct usbio_client *client = adev_to_client(adev); + struct usbio_device *usbio; + int ret; + + mutex_lock(&client->mutex); + + usbio = client->bridge; + if (!usbio) { + ret = -ENODEV; /* Disconnected */ + goto err_unlock; + } + + ret = usb_autopm_get_interface(usbio->intf); + if (ret) + goto err_unlock; + + mutex_lock(&usbio->bulk_mutex); + + /* Leave client locked until release to avoid abba deadlock issues */ + return 0; + +err_unlock: + mutex_unlock(&client->mutex); + + return ret; +} +EXPORT_SYMBOL_NS_GPL(usbio_acquire, "USBIO"); + +void usbio_release(struct auxiliary_device *adev) +{ + struct usbio_client *client = adev_to_client(adev); + struct usbio_device *usbio = client->bridge; + + lockdep_assert_held(&client->mutex); + + mutex_unlock(&usbio->bulk_mutex); + usb_autopm_put_interface(usbio->intf); + mutex_unlock(&client->mutex); +} +EXPORT_SYMBOL_NS_GPL(usbio_release, "USBIO"); + +void usbio_get_txrxbuf_len(struct auxiliary_device *adev, u16 *txbuf_len, u16 *rxbuf_len) +{ + struct usbio_client *client = adev_to_client(adev); + struct usbio_device *usbio; + + guard(mutex)(&client->mutex); + + usbio = client->bridge; + if (!usbio) + return; /* Disconnected */ + + *txbuf_len = usbio->txbuf_len; + *rxbuf_len = usbio->rxbuf_len; +} +EXPORT_SYMBOL_NS_GPL(usbio_get_txrxbuf_len, "USBIO"); + +unsigned long usbio_get_quirks(struct auxiliary_device *adev) +{ + struct usbio_client *client = adev_to_client(adev); + struct usbio_device *usbio; + + guard(mutex)(&client->mutex); + + usbio = client->bridge; + if (!usbio) + return 0; /* Disconnected */ + + return usbio->quirks; +} +EXPORT_SYMBOL_NS_GPL(usbio_get_quirks, "USBIO"); + +static void usbio_auxdev_release(struct device *dev) +{ + struct auxiliary_device *adev = to_auxiliary_dev(dev); + struct usbio_client *client = adev_to_client(adev); + + mutex_destroy(&client->mutex); + kfree(client); +} + +static int usbio_add_client(struct usbio_device *usbio, char *name, u8 id, void *data) +{ + struct usbio_client *client; + struct auxiliary_device *adev; + int ret; + + client = kzalloc(sizeof(*client), GFP_KERNEL); + if (!client) + return -ENOMEM; + + mutex_init(&client->mutex); + client->bridge = usbio; + adev = &client->auxdev; + adev->name = name; + adev->id = id; + + adev->dev.parent = usbio->dev; + adev->dev.platform_data = data; + adev->dev.release = usbio_auxdev_release; + + ret = auxiliary_device_init(adev); + if (ret) { + usbio_auxdev_release(&adev->dev); + return ret; + } + + ret = auxiliary_device_add(adev); + if (ret) { + auxiliary_device_uninit(adev); + return ret; + } + + list_add_tail(&client->link, &usbio->cli_list); + + return 0; +} + +static int usbio_enum_gpios(struct usbio_device *usbio) +{ + struct 
usbio_gpio_bank_desc *gpio = usbio->gpios; + + dev_dbg(usbio->dev, "GPIO Banks: %d\n", usbio->nr_gpio_banks); + + for (unsigned int i = 0; i < usbio->nr_gpio_banks; i++) + dev_dbg(usbio->dev, "\tBank%d[%d] map: %#08x\n", + gpio[i].id, gpio[i].pins, gpio[i].bmap); + + usbio_add_client(usbio, USBIO_GPIO_CLIENT, 0, gpio); + + return 0; +} + +static int usbio_enum_i2cs(struct usbio_device *usbio) +{ + struct usbio_i2c_bus_desc *i2c = usbio->i2cs; + + dev_dbg(usbio->dev, "I2C Busses: %d\n", usbio->nr_i2c_buses); + + for (unsigned int i = 0; i < usbio->nr_i2c_buses; i++) { + dev_dbg(usbio->dev, "\tBus%d caps: %#02x\n", i2c[i].id, i2c[i].caps); + usbio_add_client(usbio, USBIO_I2C_CLIENT, i, &i2c[i]); + } + + return 0; +} + +static int usbio_suspend(struct usb_interface *intf, pm_message_t msg) +{ + struct usbio_device *usbio = usb_get_intfdata(intf); + + usb_kill_urb(usbio->urb); + + return 0; +} + +static int usbio_resume(struct usb_interface *intf) +{ + struct usbio_device *usbio = usb_get_intfdata(intf); + + return usb_submit_urb(usbio->urb, GFP_KERNEL); +} + +static void usbio_disconnect(struct usb_interface *intf) +{ + struct usbio_device *usbio = usb_get_intfdata(intf); + struct usbio_client *client; + + /* Wakeup any clients waiting for a reply */ + usbio->rxdat_len = 0; + complete(&usbio->done); + + /* Let clients know the bridge is gone */ + list_for_each_entry(client, &usbio->cli_list, link) { + mutex_lock(&client->mutex); + client->bridge = NULL; + mutex_unlock(&client->mutex); + } + + /* From here on clients will no longer touch struct usbio_device */ + usb_kill_urb(usbio->urb); + usb_free_urb(usbio->urb); + + list_for_each_entry_reverse(client, &usbio->cli_list, link) { + auxiliary_device_delete(&client->auxdev); + auxiliary_device_uninit(&client->auxdev); + } +} + +static int usbio_probe(struct usb_interface *intf, const struct usb_device_id *id) +{ + struct usb_device *udev = interface_to_usbdev(intf); + struct usb_endpoint_descriptor *ep_in, *ep_out; + struct device *dev = &intf->dev; + struct usbio_protver protver; + struct usbio_device *usbio; + struct usbio_fwver fwver; + int ret; + + usbio = devm_kzalloc(dev, sizeof(*usbio), GFP_KERNEL); + if (!usbio) + return -ENOMEM; + + ret = devm_mutex_init(dev, &usbio->ctrl_mutex); + if (ret) + return ret; + + ret = devm_mutex_init(dev, &usbio->bulk_mutex); + if (ret) + return ret; + + usbio->dev = dev; + usbio->udev = udev; + usbio->intf = intf; + usbio->quirks = id ? 
id->driver_info : 0; + init_completion(&usbio->done); + INIT_LIST_HEAD(&usbio->cli_list); + usb_set_intfdata(intf, usbio); + + usbio->ctrl_pipe = usb_endpoint_num(&udev->ep0.desc); + usbio->ctrlbuf_len = usb_maxpacket(udev, usbio->ctrl_pipe); + usbio->ctrlbuf = devm_kzalloc(dev, usbio->ctrlbuf_len, GFP_KERNEL); + if (!usbio->ctrlbuf) + return -ENOMEM; + + /* Find the first bulk-in and bulk-out endpoints */ + ret = usb_find_common_endpoints(intf->cur_altsetting, &ep_in, &ep_out, + NULL, NULL); + if (ret) { + dev_err(dev, "Cannot find bulk endpoints: %d\n", ret); + return ret; + } + + usbio->tx_pipe = usb_sndbulkpipe(udev, usb_endpoint_num(ep_out)); + + if (usbio->quirks & USBIO_QUIRK_BULK_MAXP_63) + usbio->txbuf_len = 63; + else + usbio->txbuf_len = usb_endpoint_maxp(ep_out); + + usbio->txbuf = devm_kzalloc(dev, usbio->txbuf_len, GFP_KERNEL); + if (!usbio->txbuf) + return -ENOMEM; + + usbio->rx_pipe = usb_rcvbulkpipe(udev, usb_endpoint_num(ep_in)); + + if (usbio->quirks & USBIO_QUIRK_BULK_MAXP_63) + usbio->rxbuf_len = 63; + else + usbio->rxbuf_len = usb_endpoint_maxp(ep_in); + + usbio->rxbuf = devm_kzalloc(dev, usbio->rxbuf_len, GFP_KERNEL); + if (!usbio->rxbuf) + return -ENOMEM; + + usbio->urb = usb_alloc_urb(0, GFP_KERNEL); + if (!usbio->urb) + return -ENOMEM; + + usb_fill_bulk_urb(usbio->urb, udev, usbio->rx_pipe, usbio->rxbuf, + usbio->rxbuf_len, usbio_bulk_recv, usbio); + ret = usb_submit_urb(usbio->urb, GFP_KERNEL); + if (ret) + return dev_err_probe(dev, ret, "Submitting usb urb\n"); + + mutex_lock(&usbio->ctrl_mutex); + + ret = usbio_ctrl_msg(usbio, USBIO_PKTTYPE_CTRL, USBIO_CTRLCMD_HS, NULL, 0, NULL, 0); + if (ret < 0) + goto err_unlock; + + ret = usbio_ctrl_msg(usbio, USBIO_PKTTYPE_CTRL, USBIO_CTRLCMD_PROTVER, NULL, 0, + &protver, sizeof(protver)); + if (ret < 0) + goto err_unlock; + + ret = usbio_ctrl_msg(usbio, USBIO_PKTTYPE_CTRL, USBIO_CTRLCMD_FWVER, NULL, 0, + &fwver, sizeof(fwver)); + if (ret < 0) + goto err_unlock; + + ret = usbio_ctrl_msg(usbio, USBIO_PKTTYPE_CTRL, USBIO_CTRLCMD_ENUMGPIO, NULL, 0, + usbio->gpios, sizeof(usbio->gpios)); + if (ret < 0 || ret % sizeof(struct usbio_gpio_bank_desc)) { + ret = (ret < 0) ? ret : -EPROTO; + goto err_unlock; + } + usbio->nr_gpio_banks = ret / sizeof(struct usbio_gpio_bank_desc); + + ret = usbio_ctrl_msg(usbio, USBIO_PKTTYPE_CTRL, USBIO_CTRLCMD_ENUMI2C, NULL, 0, + usbio->i2cs, sizeof(usbio->i2cs)); + if (ret < 0 || ret % sizeof(struct usbio_i2c_bus_desc)) { + ret = (ret < 0) ? 
ret : -EPROTO; + goto err_unlock; + } + usbio->nr_i2c_buses = ret / sizeof(struct usbio_i2c_bus_desc); + + mutex_unlock(&usbio->ctrl_mutex); + + dev_dbg(dev, "ProtVer(BCD): %02x FwVer: %d.%d.%d.%d\n", + protver.ver, fwver.major, fwver.minor, + le16_to_cpu(fwver.patch), le16_to_cpu(fwver.build)); + + usbio_enum_gpios(usbio); + usbio_enum_i2cs(usbio); + + return 0; + +err_unlock: + mutex_unlock(&usbio->ctrl_mutex); + usb_kill_urb(usbio->urb); + usb_free_urb(usbio->urb); + + return ret; +} + +static const struct usb_device_id usbio_table[] = { + { USB_DEVICE(0x2ac1, 0x20c1), /* Lattice NX40 */ + .driver_info = USBIO_QUIRK_I2C_MAX_RW_LEN_52 }, + { USB_DEVICE(0x2ac1, 0x20c9), /* Lattice NX33 */ + .driver_info = USBIO_QUIRK_I2C_NO_INIT_ACK | USBIO_QUIRK_I2C_MAX_RW_LEN_52 | + USBIO_QUIRK_I2C_ALLOW_400KHZ }, + { USB_DEVICE(0x2ac1, 0x20cb) }, /* Lattice NX33U */ + { USB_DEVICE(0x06cb, 0x0701), /* Synaptics Sabre */ + .driver_info = USBIO_QUIRK_BULK_MAXP_63 | USBIO_QUIRK_I2C_USE_CHUNK_LEN }, + { } +}; +MODULE_DEVICE_TABLE(usb, usbio_table); + +static struct usb_driver usbio_driver = { + .name = "usbio-bridge", + .probe = usbio_probe, + .disconnect = usbio_disconnect, + .suspend = usbio_suspend, + .resume = usbio_resume, + .id_table = usbio_table, + .supports_autosuspend = 1, +}; +module_usb_driver(usbio_driver); + +struct usbio_match_ids_walk_data { + struct acpi_device *adev; + const struct acpi_device_id *hids; + unsigned int id; +}; + +static int usbio_match_device_ids(struct acpi_device *adev, void *data) +{ + struct usbio_match_ids_walk_data *wd = data; + unsigned int id = 0; + char *uid; + + if (acpi_match_device_ids(adev, wd->hids)) + return 0; + + uid = acpi_device_uid(adev); + if (uid) { + for (int i = 0; i < strlen(uid); i++) { + if (!kstrtouint(&uid[i], 10, &id)) + break; + } + } + + if (!uid || wd->id == id) { + wd->adev = adev; + return 1; + } + + return 0; +} + +void usbio_acpi_bind(struct auxiliary_device *adev, const struct acpi_device_id *hids) +{ + struct device *dev = &adev->dev; + struct acpi_device *parent; + struct usbio_match_ids_walk_data wd = { + .adev = NULL, + .hids = hids, + .id = adev->id, + }; + + parent = ACPI_COMPANION(dev->parent); + if (!parent) + return; + + acpi_dev_for_each_child(parent, usbio_match_device_ids, &wd); + if (wd.adev) + ACPI_COMPANION_SET(dev, wd.adev); +} +EXPORT_SYMBOL_NS_GPL(usbio_acpi_bind, "USBIO"); + +MODULE_DESCRIPTION("Intel USBIO Bridge driver"); +MODULE_AUTHOR("Israel Cepeda "); +MODULE_AUTHOR("Hans de Goede "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/usb/usbio.h b/include/linux/usb/usbio.h new file mode 100644 index 000000000000..6c4e7c246d58 --- /dev/null +++ b/include/linux/usb/usbio.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2025 Intel Corporation. 
+ * + */ + +#ifndef _LINUX_USBIO_H_ +#define _LINUX_USBIO_H_ + +#include +#include +#include +#include + +/*********************** + * USBIO Clients Names * + ***********************/ +#define USBIO_GPIO_CLIENT "usbio-gpio" +#define USBIO_I2C_CLIENT "usbio-i2c" + +/**************** + * USBIO quirks * + ****************/ +#define USBIO_QUIRK_BULK_MAXP_63 BIT(0) /* Force bulk endpoint maxp to 63 */ +#define USBIO_QUIRK_I2C_NO_INIT_ACK BIT(8) /* Do not ask for ack on I2C init */ +#define USBIO_QUIRK_I2C_MAX_RW_LEN_52 BIT(9) /* Set i2c-adapter max r/w len to 52 */ +#define USBIO_QUIRK_I2C_USE_CHUNK_LEN BIT(10) /* Send chunk-len for split xfers */ +#define USBIO_QUIRK_I2C_ALLOW_400KHZ BIT(11) /* Override desc, allowing 400 KHz */ + +/************************** + * USBIO Type Definitions * + **************************/ + +/* USBIO Packet Type */ +#define USBIO_PKTTYPE_CTRL 1 +#define USBIO_PKTTYPE_DBG 2 +#define USBIO_PKTTYPE_GPIO 3 +#define USBIO_PKTTYPE_I2C 4 + +/* USBIO Packet Header */ +struct usbio_packet_header { + u8 type; + u8 cmd; + u8 flags; +} __packed; + +/* USBIO Control Transfer Packet */ +struct usbio_ctrl_packet { + struct usbio_packet_header header; + u8 len; + u8 data[] __counted_by(len); +} __packed; + +/* USBIO Bulk Transfer Packet */ +struct usbio_bulk_packet { + struct usbio_packet_header header; + __le16 len; + u8 data[] __counted_by(len); +} __packed; + +/* USBIO GPIO commands */ +enum usbio_gpio_cmd { + USBIO_GPIOCMD_DEINIT, + USBIO_GPIOCMD_INIT, + USBIO_GPIOCMD_READ, + USBIO_GPIOCMD_WRITE, + USBIO_GPIOCMD_END +}; + +/* USBIO GPIO config */ +enum usbio_gpio_pincfg { + USBIO_GPIO_PINCFG_DEFAULT, + USBIO_GPIO_PINCFG_PULLUP, + USBIO_GPIO_PINCFG_PULLDOWN, + USBIO_GPIO_PINCFG_PUSHPULL +}; + +#define USBIO_GPIO_PINCFG_SHIFT 2 +#define USBIO_GPIO_PINCFG_MASK (0x3 << USBIO_GPIO_PINCFG_SHIFT) +#define USBIO_GPIO_SET_PINCFG(pincfg) \ + (((pincfg) << USBIO_GPIO_PINCFG_SHIFT) & USBIO_GPIO_PINCFG_MASK) + +enum usbio_gpio_pinmode { + USBIO_GPIO_PINMOD_INVAL, + USBIO_GPIO_PINMOD_INPUT, + USBIO_GPIO_PINMOD_OUTPUT, + USBIO_GPIO_PINMOD_MAXVAL +}; + +#define USBIO_GPIO_PINMOD_MASK 0x3 +#define USBIO_GPIO_SET_PINMOD(pin) (pin & USBIO_GPIO_PINMOD_MASK) + +/************************* + * USBIO GPIO Controller * + *************************/ + +#define USBIO_MAX_GPIOBANKS 5 +#define USBIO_GPIOSPERBANK 32 + +struct usbio_gpio_bank_desc { + u8 id; + u8 pins; + __le32 bmap; +} __packed; + +struct usbio_gpio_init { + u8 bankid; + u8 config; + u8 pincount; + u8 pin; +} __packed; + +struct usbio_gpio_rw { + u8 bankid; + u8 pincount; + u8 pin; + __le32 value; +} __packed; + +/* USBIO I2C commands */ +enum usbio_i2c_cmd { + USBIO_I2CCMD_UNINIT, + USBIO_I2CCMD_INIT, + USBIO_I2CCMD_READ, + USBIO_I2CCMD_WRITE, + USBIO_I2CCMD_END +}; + +/************************ + * USBIO I2C Controller * + ************************/ + +#define USBIO_MAX_I2CBUSES 5 + +#define USBIO_I2C_BUS_ADDR_CAP_10B BIT(3) /* 10bit address support */ +#define USBIO_I2C_BUS_MODE_CAP_MASK 0x3 +#define USBIO_I2C_BUS_MODE_CAP_SM 0 /* Standard Mode */ +#define USBIO_I2C_BUS_MODE_CAP_FM 1 /* Fast Mode */ +#define USBIO_I2C_BUS_MODE_CAP_FMP 2 /* Fast Mode+ */ +#define USBIO_I2C_BUS_MODE_CAP_HSM 3 /* High-Speed Mode */ + +struct usbio_i2c_bus_desc { + u8 id; + u8 caps; +} __packed; + +struct usbio_i2c_uninit { + u8 busid; + __le16 config; +} __packed; + +struct usbio_i2c_init { + u8 busid; + __le16 config; + __le32 speed; +} __packed; + +struct usbio_i2c_rw { + u8 busid; + __le16 config; + __le16 size; + u8 data[] __counted_by(size); +} __packed; 
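+
+/*
+ * Bridge API for the client auxiliary drivers (prototypes below). A minimal,
+ * hypothetical sketch of an I2C read issued by a client such as usbio-i2c,
+ * where adev is the client's auxiliary device, rw points to a caller-allocated
+ * struct usbio_i2c_rw and len is the payload size (names are illustrative
+ * only, not part of this interface):
+ *
+ *	ret = usbio_acquire(adev);
+ *	if (ret)
+ *		return ret;
+ *	ret = usbio_bulk_msg(adev, USBIO_PKTTYPE_I2C, USBIO_I2CCMD_READ, true,
+ *			     rw, struct_size(rw, data, len),
+ *			     rw, struct_size(rw, data, len));
+ *	usbio_release(adev);
+ */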
+ +int usbio_control_msg(struct auxiliary_device *adev, u8 type, u8 cmd, + const void *obuf, u16 obuf_len, void *ibuf, u16 ibuf_len); + +int usbio_bulk_msg(struct auxiliary_device *adev, u8 type, u8 cmd, bool last, + const void *obuf, u16 obuf_len, void *ibuf, u16 ibuf_len); + +int usbio_acquire(struct auxiliary_device *adev); +void usbio_release(struct auxiliary_device *adev); +void usbio_get_txrxbuf_len(struct auxiliary_device *adev, u16 *txbuf_len, u16 *rxbuf_len); +unsigned long usbio_get_quirks(struct auxiliary_device *adev); +void usbio_acpi_bind(struct auxiliary_device *adev, const struct acpi_device_id *hids); + +#endif -- cgit v1.2.3 From 7f70b89b2be66c03ddc76d3ad8aebeeec4a9c505 Mon Sep 17 00:00:00 2001 From: Guan-Yu Lin Date: Thu, 11 Sep 2025 14:20:14 +0000 Subject: usb: offload: add apis for offload usage tracking Introduce offload_usage and corresponding APIs to track offload usage on each USB device. Offload denotes that there is another co-processor accessing the USB device via the same USB host controller. To optimize power usage, it's essential to monitor whether the USB device is actively used by another co-processor. This information is vital when determining if a USB device can be safely suspended during system power state transitions. Signed-off-by: Guan-Yu Lin Link: https://lore.kernel.org/r/20250911142051.90822-3-guanyulin@google.com Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250911142051.90822-3-guanyulin@google.com --- drivers/usb/core/Makefile | 1 + drivers/usb/core/offload.c | 136 +++++++++++++++++++++++++++++++++++++++++++++ drivers/usb/core/usb.c | 1 + include/linux/usb.h | 18 ++++ 4 files changed, 156 insertions(+) create mode 100644 drivers/usb/core/offload.c (limited to 'include/linux') diff --git a/drivers/usb/core/Makefile b/drivers/usb/core/Makefile index ac006abd13b3..766000b4939e 100644 --- a/drivers/usb/core/Makefile +++ b/drivers/usb/core/Makefile @@ -9,6 +9,7 @@ usbcore-y += devio.o notify.o generic.o quirks.o devices.o usbcore-y += phy.o port.o usbcore-$(CONFIG_OF) += of.o +usbcore-$(CONFIG_USB_XHCI_SIDEBAND) += offload.o usbcore-$(CONFIG_USB_PCI) += hcd-pci.o usbcore-$(CONFIG_ACPI) += usb-acpi.o diff --git a/drivers/usb/core/offload.c b/drivers/usb/core/offload.c new file mode 100644 index 000000000000..7c699f1b8d2b --- /dev/null +++ b/drivers/usb/core/offload.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * offload.c - USB offload related functions + * + * Copyright (c) 2025, Google LLC. + * + * Author: Guan-Yu Lin + */ + +#include + +#include "usb.h" + +/** + * usb_offload_get - increment the offload_usage of a USB device + * @udev: the USB device to increment its offload_usage + * + * Incrementing the offload_usage of a usb_device indicates that offload is + * enabled on this usb_device; that is, another entity is actively handling USB + * transfers. This information allows the USB driver to adjust its power + * management policy based on offload activity. + * + * Return: 0 on success. A negative error code otherwise. + */ +int usb_offload_get(struct usb_device *udev) +{ + int ret; + + usb_lock_device(udev); + if (udev->state == USB_STATE_NOTATTACHED) { + usb_unlock_device(udev); + return -ENODEV; + } + + if (udev->state == USB_STATE_SUSPENDED || + udev->offload_at_suspend) { + usb_unlock_device(udev); + return -EBUSY; + } + + /* + * offload_usage can only be modified when the device is active, since + * it will alter the suspend flow of the device.
+ */ + ret = usb_autoresume_device(udev); + if (ret < 0) { + usb_unlock_device(udev); + return ret; + } + + udev->offload_usage++; + usb_autosuspend_device(udev); + usb_unlock_device(udev); + + return ret; +} +EXPORT_SYMBOL_GPL(usb_offload_get); + +/** + * usb_offload_put - drop the offload_usage of a USB device + * @udev: the USB device to drop its offload_usage + * + * The inverse operation of usb_offload_get, which drops the offload_usage of + * a USB device. This information allows the USB driver to adjust its power + * management policy based on offload activity. + * + * Return: 0 on success. A negative error code otherwise. + */ +int usb_offload_put(struct usb_device *udev) +{ + int ret; + + usb_lock_device(udev); + if (udev->state == USB_STATE_NOTATTACHED) { + usb_unlock_device(udev); + return -ENODEV; + } + + if (udev->state == USB_STATE_SUSPENDED || + udev->offload_at_suspend) { + usb_unlock_device(udev); + return -EBUSY; + } + + /* + * offload_usage can only be modified when the device is active, since + * it will alter the suspend flow of the device. + */ + ret = usb_autoresume_device(udev); + if (ret < 0) { + usb_unlock_device(udev); + return ret; + } + + /* Drop the count when it wasn't 0, ignore the operation otherwise. */ + if (udev->offload_usage) + udev->offload_usage--; + usb_autosuspend_device(udev); + usb_unlock_device(udev); + + return ret; +} +EXPORT_SYMBOL_GPL(usb_offload_put); + +/** + * usb_offload_check - check offload activities on a USB device + * @udev: the USB device to check its offload activity. + * + * Check if there is any offload activity on the USB device right now. This + * information could be used for power management or other forms of resource + * management. + * + * The caller must hold @udev's device lock. In addition, the caller should + * ensure downstream usb devices are all either suspended or marked as + * "offload_at_suspend" to ensure the correctness of the return value. + * + * Returns true if there is any offload activity, false otherwise. + */ +bool usb_offload_check(struct usb_device *udev) __must_hold(&udev->dev->mutex) +{ + struct usb_device *child; + bool active; + int port1; + + usb_hub_for_each_child(udev, port1, child) { + usb_lock_device(child); + active = usb_offload_check(child); + usb_unlock_device(child); + if (active) + return true; + } + + return !!udev->offload_usage; +} +EXPORT_SYMBOL_GPL(usb_offload_check); diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index b88b6271cb30..b6b0b8489523 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -670,6 +670,7 @@ struct usb_device *usb_alloc_dev(struct usb_device *parent, set_dev_node(&dev->dev, dev_to_node(bus->sysdev)); dev->state = USB_STATE_ATTACHED; dev->lpm_disable_count = 1; + dev->offload_usage = 0; atomic_set(&dev->urbnum, 0); INIT_LIST_HEAD(&dev->ep0.urb_list); diff --git a/include/linux/usb.h b/include/linux/usb.h index 70ef00c42d22..e85105939af8 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -636,6 +636,8 @@ struct usb3_lpm_parameters { * @do_remote_wakeup: remote wakeup should be enabled * @reset_resume: needs reset instead of resume * @port_is_suspended: the upstream port is suspended (L2 or U3) + * @offload_at_suspend: offload activity during suspend is enabled. + * @offload_usage: number of offload activities happening on this usb device. * @slot_id: Slot ID assigned by xHCI * @l1_params: best effor service latency for USB2 L1 LPM state, and L1 timeout.
* @u1_params: exit latencies for USB3 U1 LPM state, and hub-initiated timeout. @@ -724,6 +726,8 @@ struct usb_device { unsigned do_remote_wakeup:1; unsigned reset_resume:1; unsigned port_is_suspended:1; + unsigned offload_at_suspend:1; + int offload_usage; enum usb_link_tunnel_mode tunnel_mode; struct device_link *usb4_link; @@ -841,6 +845,20 @@ static inline void usb_mark_last_busy(struct usb_device *udev) { } #endif +#if IS_ENABLED(CONFIG_USB_XHCI_SIDEBAND) +int usb_offload_get(struct usb_device *udev); +int usb_offload_put(struct usb_device *udev); +bool usb_offload_check(struct usb_device *udev); +#else + +static inline int usb_offload_get(struct usb_device *udev) +{ return 0; } +static inline int usb_offload_put(struct usb_device *udev) +{ return 0; } +static inline bool usb_offload_check(struct usb_device *udev) +{ return false; } +#endif + extern int usb_disable_lpm(struct usb_device *udev); extern void usb_enable_lpm(struct usb_device *udev); /* Same as above, but these functions lock/unlock the bandwidth_mutex. */ -- cgit v1.2.3 From ef82a4803aabaf623bfcae07981406f1386eabf9 Mon Sep 17 00:00:00 2001 From: Guan-Yu Lin Date: Thu, 11 Sep 2025 14:20:15 +0000 Subject: xhci: sideband: add api to trace sideband usage The existing sideband driver only registers sidebands without tracking their active usage. To address this, sideband will now record its active usage when it creates/removes interrupters. In addition, a new API is introduced to provide a means for other drivers to fetch sideband activity information on a USB host controller. Signed-off-by: Guan-Yu Lin Link: https://lore.kernel.org/r/20250911142051.90822-4-guanyulin@google.com Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250911142051.90822-4-guanyulin@google.com --- drivers/usb/host/xhci-sideband.c | 36 ++++++++++++++++++++++++++++++++++ include/linux/usb/xhci-sideband.h | 9 +++++++++ 2 files changed, 45 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/host/xhci-sideband.c b/drivers/usb/host/xhci-sideband.c index d49f9886dd84..e771a476fef2 100644 --- a/drivers/usb/host/xhci-sideband.c +++ b/drivers/usb/host/xhci-sideband.c @@ -266,6 +266,31 @@ xhci_sideband_get_event_buffer(struct xhci_sideband *sb) } EXPORT_SYMBOL_GPL(xhci_sideband_get_event_buffer); +/** + * xhci_sideband_check - check the existence of active sidebands + * @hcd: the host controller driver associated with the target host controller + * + * Allow other drivers, such as the USB controller driver, to check if there is + * any sideband activity on the host controller. This information could be used + * for power management or other forms of resource management. The caller should + * ensure downstream usb devices are all either suspended or marked as + * "offload_at_suspend" to ensure the correctness of the return value. + * + * Returns true if any active sideband exists, false otherwise.
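+ *
+ * A minimal, hypothetical caller sketch for a host controller suspend path,
+ * where an active sideband vetoes the power-down (not part of this patch):
+ *
+ *	if (xhci_sideband_check(hcd))
+ *		return -EBUSY;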
+ */ +bool xhci_sideband_check(struct usb_hcd *hcd) +{ + struct usb_device *udev = hcd->self.root_hub; + bool active; + + usb_lock_device(udev); + active = usb_offload_check(udev); + usb_unlock_device(udev); + + return active; +} +EXPORT_SYMBOL_GPL(xhci_sideband_check); + /** * xhci_sideband_create_interrupter - creates a new interrupter for this sideband * @sb: sideband instance for this usb device @@ -286,6 +311,7 @@ xhci_sideband_create_interrupter(struct xhci_sideband *sb, int num_seg, bool ip_autoclear, u32 imod_interval, int intr_num) { int ret = 0; + struct usb_device *udev; if (!sb || !sb->xhci) return -ENODEV; @@ -304,6 +330,9 @@ xhci_sideband_create_interrupter(struct xhci_sideband *sb, int num_seg, goto out; } + udev = sb->vdev->udev; + ret = usb_offload_get(udev); + sb->ir->ip_autoclear = ip_autoclear; out: @@ -323,6 +352,8 @@ EXPORT_SYMBOL_GPL(xhci_sideband_create_interrupter); void xhci_sideband_remove_interrupter(struct xhci_sideband *sb) { + struct usb_device *udev; + if (!sb || !sb->ir) return; @@ -330,6 +361,11 @@ xhci_sideband_remove_interrupter(struct xhci_sideband *sb) xhci_remove_secondary_interrupter(xhci_to_hcd(sb->xhci), sb->ir); sb->ir = NULL; + udev = sb->vdev->udev; + + if (udev->state != USB_STATE_NOTATTACHED) + usb_offload_put(udev); + mutex_unlock(&sb->mutex); } EXPORT_SYMBOL_GPL(xhci_sideband_remove_interrupter); diff --git a/include/linux/usb/xhci-sideband.h b/include/linux/usb/xhci-sideband.h index 45288c392f6e..005257085dcb 100644 --- a/include/linux/usb/xhci-sideband.h +++ b/include/linux/usb/xhci-sideband.h @@ -11,6 +11,7 @@ #include #include +#include #define EP_CTX_PER_DEV 31 /* FIXME defined twice, from xhci.h */ @@ -83,6 +84,14 @@ xhci_sideband_get_endpoint_buffer(struct xhci_sideband *sb, struct usb_host_endpoint *host_ep); struct sg_table * xhci_sideband_get_event_buffer(struct xhci_sideband *sb); + +#if IS_ENABLED(CONFIG_USB_XHCI_SIDEBAND) +bool xhci_sideband_check(struct usb_hcd *hcd); +#else +static inline bool xhci_sideband_check(struct usb_hcd *hcd) +{ return false; } +#endif /* IS_ENABLED(CONFIG_USB_XHCI_SIDEBAND) */ + int xhci_sideband_create_interrupter(struct xhci_sideband *sb, int num_seg, bool ip_autoclear, u32 imod_interval, int intr_num); -- cgit v1.2.3 From f338529ca9279e3bea392cb53cec8bd292909cb1 Mon Sep 17 00:00:00 2001 From: Sarthak Garg Date: Mon, 8 Sep 2025 16:11:21 +0530 Subject: mmc: core: Parse and use the new max-sd-hs-hz DT property Introduce a new device tree property to cap the maximum High-Speed (HS) mode frequency for SD cards, accommodating board-specific electrical limitations that cannot support the default 50 MHz HS frequency. Signed-off-by: Sarthak Garg Signed-off-by: Ulf Hansson --- drivers/mmc/core/host.c | 2 ++ drivers/mmc/core/sd.c | 2 +- include/linux/mmc/host.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 5f0ec23aeff5..88c95dbfd9cf 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -302,6 +302,8 @@ int mmc_of_parse(struct mmc_host *host) /* f_max is obtained from the optional "max-frequency" property */ device_property_read_u32(dev, "max-frequency", &host->f_max); + device_property_read_u32(dev, "max-sd-hs-hz", &host->max_sd_hs_hz); + /* * Configure CD and WP pins. They are both by default active low to * match the SDHCI spec.
If GPIOs are provided for CD and / or WP, the diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index ec02067f03c5..67cd63004829 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -359,7 +359,7 @@ static int mmc_read_switch(struct mmc_card *card) } if (status[13] & SD_MODE_HIGH_SPEED) - card->sw_caps.hs_max_dtr = HIGH_SPEED_MAX_DTR; + card->sw_caps.hs_max_dtr = card->host->max_sd_hs_hz ?: HIGH_SPEED_MAX_DTR; if (card->scr.sda_spec3) { card->sw_caps.sd3_bus_mode = status[13]; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index e0d935a4ac1d..e0e2c265e5d1 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -576,6 +576,7 @@ struct mmc_host { int hsq_depth; u32 err_stats[MMC_ERR_MAX]; + u32 max_sd_hs_hz; unsigned long private[] ____cacheline_aligned; }; -- cgit v1.2.3 From 70da02061499ca89ab92f7c4310f815d5fe674ec Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Mon, 8 Sep 2025 07:35:22 +0000 Subject: iio: add power and energy measurement modifiers Add new IIO modifiers to support power and energy measurement devices: Power modifiers: - IIO_MOD_ACTIVE: Real power consumed by the load - IIO_MOD_REACTIVE: Power that oscillates between source and load - IIO_MOD_APPARENT: Magnitude of complex power Signal quality modifiers: - IIO_MOD_RMS: Root Mean Square value Additionally adds: - IIO_CHAN_INFO_POWERFACTOR: Power factor channel info type for representing the ratio of active power to apparent power These modifiers enable proper representation of power measurement devices like energy meters and power analyzers. Signed-off-by: Antoniu Miclaus Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 29 +++++++++++++++++++++++++++++ drivers/iio/industrialio-core.c | 5 +++++ include/linux/iio/types.h | 1 + include/uapi/linux/iio/types.h | 4 ++++ tools/iio/iio_event_monitor.c | 8 ++++++++ 5 files changed, 47 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 2fb2cea4b192..78da68826307 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -167,7 +167,18 @@ Description: is required is a consistent labeling. Units after application of scale and offset are millivolts. +What: /sys/bus/iio/devices/iio:deviceX/in_altvoltageY_rms_raw +KernelVersion: 6.18 +Contact: linux-iio@vger.kernel.org +Description: + Raw (unscaled) Root Mean Square (RMS) voltage measurement from + channel Y. Units after application of scale and offset are + millivolts. + What: /sys/bus/iio/devices/iio:deviceX/in_powerY_raw +What: /sys/bus/iio/devices/iio:deviceX/in_powerY_active_raw +What: /sys/bus/iio/devices/iio:deviceX/in_powerY_reactive_raw +What: /sys/bus/iio/devices/iio:deviceX/in_powerY_apparent_raw KernelVersion: 4.5 Contact: linux-iio@vger.kernel.org Description: @@ -176,6 +187,13 @@ Description: unique to allow association with event codes. Units after application of scale and offset are milliwatts. +What: /sys/bus/iio/devices/iio:deviceX/in_powerY_powerfactor +KernelVersion: 6.18 +Contact: linux-iio@vger.kernel.org +Description: + Power factor measurement from channel Y. Power factor is the + ratio of active power to apparent power. The value is unitless. 
+ What: /sys/bus/iio/devices/iio:deviceX/in_capacitanceY_raw KernelVersion: 3.2 Contact: linux-iio@vger.kernel.org @@ -1569,6 +1587,9 @@ Description: What: /sys/.../iio:deviceX/in_energy_input What: /sys/.../iio:deviceX/in_energy_raw +What: /sys/.../iio:deviceX/in_energyY_active_raw +What: /sys/.../iio:deviceX/in_energyY_reactive_raw +What: /sys/.../iio:deviceX/in_energyY_apparent_raw KernelVersion: 4.0 Contact: linux-iio@vger.kernel.org Description: @@ -1707,6 +1728,14 @@ Description: component of the signal while the 'q' channel contains the quadrature component. +What: /sys/bus/iio/devices/iio:deviceX/in_altcurrentY_rms_raw +KernelVersion: 6.18 +Contact: linux-iio@vger.kernel.org +Description: + Raw (unscaled no bias removal etc.) Root Mean Square (RMS) current + measurement from channel Y. Units after application of scale and + offset are milliamps. + What: /sys/.../iio:deviceX/in_energy_en What: /sys/.../iio:deviceX/in_distance_en What: /sys/.../iio:deviceX/in_velocity_sqrt(x^2+y^2+z^2)_en diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 4ff2c44f9324..88c3d585a1bd 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -153,6 +153,10 @@ static const char * const iio_modifier_names[] = { [IIO_MOD_PITCH] = "pitch", [IIO_MOD_YAW] = "yaw", [IIO_MOD_ROLL] = "roll", + [IIO_MOD_RMS] = "rms", + [IIO_MOD_ACTIVE] = "active", + [IIO_MOD_REACTIVE] = "reactive", + [IIO_MOD_APPARENT] = "apparent", }; /* relies on pairs of these shared then separate */ @@ -190,6 +194,7 @@ static const char * const iio_chan_info_postfix[] = { [IIO_CHAN_INFO_ZEROPOINT] = "zeropoint", [IIO_CHAN_INFO_TROUGH] = "trough_raw", [IIO_CHAN_INFO_CONVDELAY] = "convdelay", + [IIO_CHAN_INFO_POWERFACTOR] = "powerfactor", }; /** * iio_device_id() - query the unique ID for the device diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index ad2761efcc83..34eebad12d2c 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -70,6 +70,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_ZEROPOINT, IIO_CHAN_INFO_TROUGH, IIO_CHAN_INFO_CONVDELAY, + IIO_CHAN_INFO_POWERFACTOR, }; #endif /* _IIO_TYPES_H_ */ diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index 3c3cc1497a1e..6d269b844271 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -109,6 +109,10 @@ enum iio_modifier { IIO_MOD_ROLL, IIO_MOD_LIGHT_UVA, IIO_MOD_LIGHT_UVB, + IIO_MOD_RMS, + IIO_MOD_ACTIVE, + IIO_MOD_REACTIVE, + IIO_MOD_APPARENT, }; enum iio_event_type { diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c index d26aff649f3f..03ca33869ce8 100644 --- a/tools/iio/iio_event_monitor.c +++ b/tools/iio/iio_event_monitor.c @@ -141,6 +141,10 @@ static const char * const iio_modifier_names[] = { [IIO_MOD_PITCH] = "pitch", [IIO_MOD_YAW] = "yaw", [IIO_MOD_ROLL] = "roll", + [IIO_MOD_RMS] = "rms", + [IIO_MOD_ACTIVE] = "active", + [IIO_MOD_REACTIVE] = "reactive", + [IIO_MOD_APPARENT] = "apparent", }; static bool event_is_known(struct iio_event_data *event) @@ -240,6 +244,10 @@ static bool event_is_known(struct iio_event_data *event) case IIO_MOD_PM4: case IIO_MOD_PM10: case IIO_MOD_O2: + case IIO_MOD_RMS: + case IIO_MOD_ACTIVE: + case IIO_MOD_REACTIVE: + case IIO_MOD_APPARENT: break; default: return false; -- cgit v1.2.3 From 3422b4bc606eee2ba7758ea9347c83332eeec3e3 Mon Sep 17 00:00:00 2001 From: Duje Mihanović Date: Thu, 11 Sep 2025 14:43:45 +0200 Subject: iio: adc: Add driver for Marvell 88PM886 PMIC ADC 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Marvell's 88PM886 PMIC has a so-called General Purpose ADC used for monitoring various system voltages and temperatures. Add the relevant register definitions to the MFD header and a driver for the ADC. Acked-by: Karel Balej # for the PMIC Signed-off-by: Duje Mihanović Signed-off-by: Jonathan Cameron --- MAINTAINERS | 5 + drivers/iio/adc/88pm886-gpadc.c | 393 ++++++++++++++++++++++++++++++++++++++++ drivers/iio/adc/Kconfig | 13 ++ drivers/iio/adc/Makefile | 1 + include/linux/mfd/88pm886.h | 58 ++++++ 5 files changed, 470 insertions(+) create mode 100644 drivers/iio/adc/88pm886-gpadc.c (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 75615d82593e..5c572204933d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14714,6 +14714,11 @@ F: drivers/regulator/88pm886-regulator.c F: drivers/rtc/rtc-88pm886.c F: include/linux/mfd/88pm886.h +MARVELL 88PM886 PMIC GPADC DRIVER +M: Duje Mihanović +S: Maintained +F: drivers/iio/adc/88pm886-gpadc.c + MARVELL ARMADA 3700 PHY DRIVERS M: Miquel Raynal S: Maintained diff --git a/drivers/iio/adc/88pm886-gpadc.c b/drivers/iio/adc/88pm886-gpadc.c new file mode 100644 index 000000000000..cffe35136685 --- /dev/null +++ b/drivers/iio/adc/88pm886-gpadc.c @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2025, Duje Mihanović + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +struct pm886_gpadc { + struct regmap *map; +}; + +enum pm886_gpadc_channel { + VSC_CHAN, + VCHG_PWR_CHAN, + VCF_OUT_CHAN, + VBAT_CHAN, + VBAT_SLP_CHAN, + VBUS_CHAN, + + GPADC0_CHAN, + GPADC1_CHAN, + GPADC2_CHAN, + GPADC3_CHAN, + + GND_DET1_CHAN, + GND_DET2_CHAN, + MIC_DET_CHAN, + + TINT_CHAN, +}; + +static const int pm886_gpadc_regs[] = { + [VSC_CHAN] = PM886_REG_GPADC_VSC, + [VCHG_PWR_CHAN] = PM886_REG_GPADC_VCHG_PWR, + [VCF_OUT_CHAN] = PM886_REG_GPADC_VCF_OUT, + [VBAT_CHAN] = PM886_REG_GPADC_VBAT, + [VBAT_SLP_CHAN] = PM886_REG_GPADC_VBAT_SLP, + [VBUS_CHAN] = PM886_REG_GPADC_VBUS, + + [GPADC0_CHAN] = PM886_REG_GPADC_GPADC0, + [GPADC1_CHAN] = PM886_REG_GPADC_GPADC1, + [GPADC2_CHAN] = PM886_REG_GPADC_GPADC2, + [GPADC3_CHAN] = PM886_REG_GPADC_GPADC3, + + [GND_DET1_CHAN] = PM886_REG_GPADC_GND_DET1, + [GND_DET2_CHAN] = PM886_REG_GPADC_GND_DET2, + [MIC_DET_CHAN] = PM886_REG_GPADC_MIC_DET, + + [TINT_CHAN] = PM886_REG_GPADC_TINT, +}; + +#define ADC_CHANNEL_VOLTAGE(index, lsb, name) \ +{ \ + .type = IIO_VOLTAGE, \ + .indexed = 1, \ + .channel = index, \ + .address = lsb, \ + .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | \ + BIT(IIO_CHAN_INFO_SCALE), \ + .datasheet_name = name, \ +} + +#define ADC_CHANNEL_RESISTANCE(index, lsb, name) \ +{ \ + .type = IIO_RESISTANCE, \ + .indexed = 1, \ + .channel = index, \ + .address = lsb, \ + .info_mask_separate = BIT(IIO_CHAN_INFO_PROCESSED), \ + .datasheet_name = name, \ +} + +#define ADC_CHANNEL_TEMPERATURE(index, lsb, name) \ +{ \ + .type = IIO_TEMP, \ + .indexed = 1, \ + .channel = index, \ + .address = lsb, \ + .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | \ + BIT(IIO_CHAN_INFO_SCALE) | \ + BIT(IIO_CHAN_INFO_OFFSET), \ + .datasheet_name = name, \ +} + +static const struct iio_chan_spec pm886_gpadc_channels[] = { + ADC_CHANNEL_VOLTAGE(VSC_CHAN, 1367, "vsc"), + ADC_CHANNEL_VOLTAGE(VCHG_PWR_CHAN, 1709, "vchg_pwr"), + ADC_CHANNEL_VOLTAGE(VCF_OUT_CHAN, 1367, "vcf_out"), + 
ADC_CHANNEL_VOLTAGE(VBAT_CHAN, 1367, "vbat"), + ADC_CHANNEL_VOLTAGE(VBAT_SLP_CHAN, 1367, "vbat_slp"), + ADC_CHANNEL_VOLTAGE(VBUS_CHAN, 1709, "vbus"), + + ADC_CHANNEL_RESISTANCE(GPADC0_CHAN, 342, "gpadc0"), + ADC_CHANNEL_RESISTANCE(GPADC1_CHAN, 342, "gpadc1"), + ADC_CHANNEL_RESISTANCE(GPADC2_CHAN, 342, "gpadc2"), + ADC_CHANNEL_RESISTANCE(GPADC3_CHAN, 342, "gpadc3"), + + ADC_CHANNEL_VOLTAGE(GND_DET1_CHAN, 342, "gnddet1"), + ADC_CHANNEL_VOLTAGE(GND_DET2_CHAN, 342, "gnddet2"), + ADC_CHANNEL_VOLTAGE(MIC_DET_CHAN, 1367, "mic_det"), + + ADC_CHANNEL_TEMPERATURE(TINT_CHAN, 104, "tint"), +}; + +static const struct regmap_config pm886_gpadc_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = PM886_GPADC_MAX_REGISTER, +}; + +static int gpadc_get_raw(struct iio_dev *iio, enum pm886_gpadc_channel chan) +{ + struct pm886_gpadc *gpadc = iio_priv(iio); + __be16 buf; + int ret; + + ret = regmap_bulk_read(gpadc->map, pm886_gpadc_regs[chan], &buf, sizeof(buf)); + if (ret) + return ret; + + return be16_to_cpu(buf) >> 4; +} + +static int +gpadc_set_bias(struct pm886_gpadc *gpadc, enum pm886_gpadc_channel chan, bool on) +{ + unsigned int gpadc_num = chan - GPADC0_CHAN; + unsigned int bits = BIT(gpadc_num + 4) | BIT(gpadc_num); + + return regmap_assign_bits(gpadc->map, PM886_REG_GPADC_CONFIG(0x14), bits, on); +} + +static int +gpadc_find_bias_current(struct iio_dev *iio, struct iio_chan_spec const *chan, + unsigned int *raw_uV, unsigned int *raw_uA) +{ + struct pm886_gpadc *gpadc = iio_priv(iio); + unsigned int gpadc_num = chan->channel - GPADC0_CHAN; + unsigned int reg = PM886_REG_GPADC_CONFIG(0xb + gpadc_num); + unsigned long lsb = chan->address; + int ret; + + for (unsigned int i = 0; i < PM886_GPADC_BIAS_LEVELS; i++) { + ret = regmap_update_bits(gpadc->map, reg, GENMASK(3, 0), i); + if (ret) + return ret; + + /* Wait for the new bias level to apply. */ + fsleep(5 * USEC_PER_MSEC); + + *raw_uA = PM886_GPADC_INDEX_TO_BIAS_uA(i); + *raw_uV = gpadc_get_raw(iio, chan->channel) * lsb; + + /* + * Vendor kernel errors out above 1.25 V, but testing shows + * that the resistance of the battery detection channel (GPADC2 + * on coreprimevelte) reaches about 1.4 MΩ when the battery is + * removed, which can't be measured with such a low upper + * limit. Therefore, to be able to detect the battery without + * ugly externs as used in the vendor fuel gauge driver, + * increase this limit a bit. + */ + if (WARN_ON(*raw_uV > 1500 * (MICRO / MILLI))) + return -EIO; + + /* + * Vendor kernel errors out under 300 mV, but for the same + * reason as above (except the channel hovers around 3.5 kΩ + * with battery present) reduce this limit. 
+ */ + if (*raw_uV < 200 * (MICRO / MILLI)) { + dev_dbg(&iio->dev, "bad bias for chan %d: %d uA @ %d uV\n", + chan->channel, *raw_uA, *raw_uV); + continue; + } + + dev_dbg(&iio->dev, "good bias for chan %d: %d uA @ %d uV\n", + chan->channel, *raw_uA, *raw_uV); + return 0; + } + + dev_err(&iio->dev, "failed to find good bias for chan %d\n", chan->channel); + return -EINVAL; +} + +static int +gpadc_get_resistance_ohm(struct iio_dev *iio, struct iio_chan_spec const *chan) +{ + struct pm886_gpadc *gpadc = iio_priv(iio); + unsigned int raw_uV, raw_uA; + int ret; + + ret = gpadc_set_bias(gpadc, chan->channel, true); + if (ret) + goto out; + + ret = gpadc_find_bias_current(iio, chan, &raw_uV, &raw_uA); + if (ret) + goto out; + + ret = DIV_ROUND_CLOSEST(raw_uV, raw_uA); +out: + gpadc_set_bias(gpadc, chan->channel, false); + return ret; +} + +static int +__pm886_gpadc_read_raw(struct iio_dev *iio, struct iio_chan_spec const *chan, + int *val, int *val2, long mask) +{ + unsigned long lsb = chan->address; + + switch (mask) { + case IIO_CHAN_INFO_RAW: + *val = gpadc_get_raw(iio, chan->channel); + if (*val < 0) + return *val; + + return IIO_VAL_INT; + case IIO_CHAN_INFO_SCALE: + *val = lsb; + + if (chan->type == IIO_VOLTAGE) { + *val2 = MILLI; + return IIO_VAL_FRACTIONAL; + } else { + return IIO_VAL_INT; + } + case IIO_CHAN_INFO_OFFSET: + /* Raw value is 104 millikelvin/LSB, convert it to 104 millicelsius/LSB */ + *val = ABSOLUTE_ZERO_MILLICELSIUS; + *val2 = lsb; + return IIO_VAL_FRACTIONAL; + case IIO_CHAN_INFO_PROCESSED: + *val = gpadc_get_resistance_ohm(iio, chan); + if (*val < 0) + return *val; + + return IIO_VAL_INT; + default: + return -EINVAL; + } +} + +static int pm886_gpadc_read_raw(struct iio_dev *iio, struct iio_chan_spec const *chan, + int *val, int *val2, long mask) +{ + struct device *dev = iio->dev.parent; + int ret; + + ret = pm_runtime_resume_and_get(dev); + if (ret) + return ret; + + ret = __pm886_gpadc_read_raw(iio, chan, val, val2, mask); + + pm_runtime_put_autosuspend(dev); + return ret; +} + +static int pm886_gpadc_hw_enable(struct regmap *map) +{ + const u8 config[] = { + PM886_GPADC_CONFIG1_EN_ALL, + PM886_GPADC_CONFIG2_EN_ALL, + PM886_GPADC_GND_DET2_EN, + }; + int ret; + + /* Enable the ADC block. */ + ret = regmap_set_bits(map, PM886_REG_GPADC_CONFIG(0x6), BIT(0)); + if (ret) + return ret; + + /* Enable all channels. 
*/ + return regmap_bulk_write(map, PM886_REG_GPADC_CONFIG(0x1), config, ARRAY_SIZE(config)); +} + +static int pm886_gpadc_hw_disable(struct regmap *map) +{ + return regmap_clear_bits(map, PM886_REG_GPADC_CONFIG(0x6), BIT(0)); +} + +static const struct iio_info pm886_gpadc_iio_info = { + .read_raw = pm886_gpadc_read_raw, +}; + +static int pm886_gpadc_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct pm886_chip *chip = dev_get_drvdata(dev->parent); + struct i2c_client *client = chip->client; + struct pm886_gpadc *gpadc; + struct i2c_client *page; + struct iio_dev *iio; + int ret; + + iio = devm_iio_device_alloc(dev, sizeof(*gpadc)); + if (!iio) + return -ENOMEM; + + gpadc = iio_priv(iio); + dev_set_drvdata(dev, iio); + + page = devm_i2c_new_dummy_device(dev, client->adapter, + client->addr + PM886_PAGE_OFFSET_GPADC); + if (IS_ERR(page)) + return dev_err_probe(dev, PTR_ERR(page), "Failed to initialize GPADC page\n"); + + gpadc->map = devm_regmap_init_i2c(page, &pm886_gpadc_regmap_config); + if (IS_ERR(gpadc->map)) + return dev_err_probe(dev, PTR_ERR(gpadc->map), + "Failed to initialize GPADC regmap\n"); + + iio->name = "88pm886-gpadc"; + iio->modes = INDIO_DIRECT_MODE; + iio->info = &pm886_gpadc_iio_info; + iio->channels = pm886_gpadc_channels; + iio->num_channels = ARRAY_SIZE(pm886_gpadc_channels); + device_set_node(&iio->dev, dev_fwnode(dev->parent)); + + ret = devm_pm_runtime_enable(dev); + if (ret) + return dev_err_probe(dev, ret, "Failed to enable runtime PM\n"); + + pm_runtime_set_autosuspend_delay(dev, 50); + pm_runtime_use_autosuspend(dev); + ret = devm_iio_device_register(dev, iio); + if (ret) + return dev_err_probe(dev, ret, "Failed to register ADC\n"); + + return 0; +} + +static int pm886_gpadc_runtime_resume(struct device *dev) +{ + struct iio_dev *iio = dev_get_drvdata(dev); + struct pm886_gpadc *gpadc = iio_priv(iio); + + return pm886_gpadc_hw_enable(gpadc->map); +} + +static int pm886_gpadc_runtime_suspend(struct device *dev) +{ + struct iio_dev *iio = dev_get_drvdata(dev); + struct pm886_gpadc *gpadc = iio_priv(iio); + + return pm886_gpadc_hw_disable(gpadc->map); +} + +static DEFINE_RUNTIME_DEV_PM_OPS(pm886_gpadc_pm_ops, + pm886_gpadc_runtime_suspend, + pm886_gpadc_runtime_resume, NULL); + +static const struct platform_device_id pm886_gpadc_id[] = { + { "88pm886-gpadc" }, + { } +}; +MODULE_DEVICE_TABLE(platform, pm886_gpadc_id); + +static struct platform_driver pm886_gpadc_driver = { + .driver = { + .name = "88pm886-gpadc", + .pm = pm_ptr(&pm886_gpadc_pm_ops), + }, + .probe = pm886_gpadc_probe, + .id_table = pm886_gpadc_id, +}; +module_platform_driver(pm886_gpadc_driver); + +MODULE_AUTHOR("Duje Mihanović "); +MODULE_DESCRIPTION("Marvell 88PM886 GPADC driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig index 09ab25c8cfdb..8c84b2c1d223 100644 --- a/drivers/iio/adc/Kconfig +++ b/drivers/iio/adc/Kconfig @@ -9,6 +9,19 @@ menu "Analog to digital converters" config IIO_ADC_HELPER tristate +config 88PM886_GPADC + tristate "Marvell 88PM886 GPADC driver" + depends on MFD_88PM886_PMIC + default MFD_88PM886_PMIC + help + Say Y here to enable support for the GPADC (General Purpose ADC) + found on the Marvell 88PM886 PMIC. The GPADC measures various + internal voltages and temperatures, including (but not limited to) + system, battery and USB Vbus. + + To compile this driver as a module, choose M here: the module will be + called 88pm886-gpadc. 
+ config AB8500_GPADC bool "ST-Ericsson AB8500 GPADC driver" depends on AB8500_CORE && REGULATOR_AB8500 diff --git a/drivers/iio/adc/Makefile b/drivers/iio/adc/Makefile index 4d071f290911..611c16430621 100644 --- a/drivers/iio/adc/Makefile +++ b/drivers/iio/adc/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_IIO_ADC_HELPER) += industrialio-adc.o # When adding new entries keep the list in alphabetical order +obj-$(CONFIG_88PM886_GPADC) += 88pm886-gpadc.o obj-$(CONFIG_AB8500_GPADC) += ab8500-gpadc.o obj-$(CONFIG_AD_SIGMA_DELTA) += ad_sigma_delta.o obj-$(CONFIG_AD4000) += ad4000.o diff --git a/include/linux/mfd/88pm886.h b/include/linux/mfd/88pm886.h index 85eca44f39ab..38892ba7b8a4 100644 --- a/include/linux/mfd/88pm886.h +++ b/include/linux/mfd/88pm886.h @@ -10,6 +10,7 @@ #define PM886_IRQ_ONKEY 0 #define PM886_PAGE_OFFSET_REGULATORS 1 +#define PM886_PAGE_OFFSET_GPADC 2 #define PM886_REG_ID 0x00 @@ -70,6 +71,63 @@ #define PM886_LDO_VSEL_MASK 0x0f #define PM886_BUCK_VSEL_MASK 0x7f +/* GPADC enable/disable registers */ +#define PM886_REG_GPADC_CONFIG(n) (n) + +#define PM886_GPADC_VSC_EN BIT(0) +#define PM886_GPADC_VBAT_EN BIT(1) +#define PM886_GPADC_GNDDET1_EN BIT(3) +#define PM886_GPADC_VBUS_EN BIT(4) +#define PM886_GPADC_VCHG_PWR_EN BIT(5) +#define PM886_GPADC_VCF_OUT_EN BIT(6) +#define PM886_GPADC_CONFIG1_EN_ALL \ + (PM886_GPADC_VSC_EN | \ + PM886_GPADC_VBAT_EN | \ + PM886_GPADC_GNDDET1_EN | \ + PM886_GPADC_VBUS_EN | \ + PM886_GPADC_VCHG_PWR_EN | \ + PM886_GPADC_VCF_OUT_EN) + +#define PM886_GPADC_TINT_EN BIT(0) +#define PM886_GPADC_PMODE_EN BIT(1) +#define PM886_GPADC_GPADC0_EN BIT(2) +#define PM886_GPADC_GPADC1_EN BIT(3) +#define PM886_GPADC_GPADC2_EN BIT(4) +#define PM886_GPADC_GPADC3_EN BIT(5) +#define PM886_GPADC_MIC_DET_EN BIT(6) +#define PM886_GPADC_CONFIG2_EN_ALL \ + (PM886_GPADC_TINT_EN | \ + PM886_GPADC_GPADC0_EN | \ + PM886_GPADC_GPADC1_EN | \ + PM886_GPADC_GPADC2_EN | \ + PM886_GPADC_GPADC3_EN | \ + PM886_GPADC_MIC_DET_EN) + +/* No CONFIG3_EN_ALL because this is the only bit there. */ +#define PM886_GPADC_GND_DET2_EN BIT(0) + +/* GPADC channel registers */ +#define PM886_REG_GPADC_VSC 0x40 +#define PM886_REG_GPADC_VCHG_PWR 0x4c +#define PM886_REG_GPADC_VCF_OUT 0x4e +#define PM886_REG_GPADC_TINT 0x50 +#define PM886_REG_GPADC_GPADC0 0x54 +#define PM886_REG_GPADC_GPADC1 0x56 +#define PM886_REG_GPADC_GPADC2 0x58 +#define PM886_REG_GPADC_VBAT 0xa0 +#define PM886_REG_GPADC_GND_DET1 0xa4 +#define PM886_REG_GPADC_GND_DET2 0xa6 +#define PM886_REG_GPADC_VBUS 0xa8 +#define PM886_REG_GPADC_GPADC3 0xaa +#define PM886_REG_GPADC_MIC_DET 0xac +#define PM886_REG_GPADC_VBAT_SLP 0xb0 + +/* VBAT_SLP is the last register and is 2 bytes wide like other channels. */ +#define PM886_GPADC_MAX_REGISTER (PM886_REG_GPADC_VBAT_SLP + 1) + +#define PM886_GPADC_BIAS_LEVELS 16 +#define PM886_GPADC_INDEX_TO_BIAS_uA(i) (1 + (i) * 5) + struct pm886_chip { struct i2c_client *client; unsigned int chip_id; -- cgit v1.2.3 From 1ab40529ad52f339975886a6a9e815dfdcb8d011 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Tue, 15 Jul 2025 21:52:54 +0300 Subject: media: uvcvideo: Move MSXU_CONTROL_METADATA definition to header Move the MSXU_CONTROL_METADATA control definition to the include/linux/usb/uvc.h header, alongside the corresponding XU GUID. Add a UVC_ prefix to avoid namespace clashes. While at it, add definitions for the other controls of that extension unit, as defined in https://learn.microsoft.com/en-us/windows-hardware/drivers/stream/uvc-extensions-1-5#222-extension-unit-controls.
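As a purely illustrative, hypothetical sketch (not part of this patch), a driver that has located the MSXU entity could query any of these selectors through the existing uvc_query_ctrl() helper, reusing the names from the hunks below, e.g. to read the current IR torch state:

	ret = uvc_query_ctrl(dev, UVC_GET_CUR, entity->id, dev->intfnum,
			     UVC_MSXU_CONTROL_IR_TORCH, data, sizeof(*data));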
Signed-off-by: Laurent Pinchart Reviewed-by: Ricardo Ribalda Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Signed-off-by: Hans Verkuil --- drivers/media/usb/uvc/uvc_metadata.c | 11 +++++------ include/linux/usb/uvc.h | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/media/usb/uvc/uvc_metadata.c b/drivers/media/usb/uvc/uvc_metadata.c index 7368400734a3..b35111d08759 100644 --- a/drivers/media/usb/uvc/uvc_metadata.c +++ b/drivers/media/usb/uvc/uvc_metadata.c @@ -171,7 +171,6 @@ static struct uvc_entity *uvc_meta_find_msxu(struct uvc_device *dev) return NULL; } -#define MSXU_CONTROL_METADATA 0x9 static int uvc_meta_detect_msxu(struct uvc_device *dev) { u32 *data __free(kfree) = NULL; @@ -195,7 +194,7 @@ static int uvc_meta_detect_msxu(struct uvc_device *dev) * returns metadata. */ ret = uvc_query_ctrl(dev, UVC_GET_CUR, entity->id, dev->intfnum, - MSXU_CONTROL_METADATA, data, sizeof(*data)); + UVC_MSXU_CONTROL_METADATA, data, sizeof(*data)); if (ret) return 0; @@ -205,23 +204,23 @@ static int uvc_meta_detect_msxu(struct uvc_device *dev) } /* - * Set the value of MSXU_CONTROL_METADATA to the value reported by + * Set the value of UVC_MSXU_CONTROL_METADATA to the value reported by * GET_MAX to enable production of MSXU metadata. The GET_MAX request * reports the maximum size of the metadata, if its value is 0 then MSXU * metadata is not supported. For more information, see * https://learn.microsoft.com/en-us/windows-hardware/drivers/stream/uvc-extensions-1-5#2229-metadata-control */ ret = uvc_query_ctrl(dev, UVC_GET_MAX, entity->id, dev->intfnum, - MSXU_CONTROL_METADATA, data, sizeof(*data)); + UVC_MSXU_CONTROL_METADATA, data, sizeof(*data)); if (ret || !*data) return 0; /* - * If we can set MSXU_CONTROL_METADATA, the device will report + * If we can set UVC_MSXU_CONTROL_METADATA, the device will report * metadata. 
*/ ret = uvc_query_ctrl(dev, UVC_SET_CUR, entity->id, dev->intfnum, - MSXU_CONTROL_METADATA, data, sizeof(*data)); + UVC_MSXU_CONTROL_METADATA, data, sizeof(*data)); if (!ret) dev->quirks |= UVC_QUIRK_MSXU_META; diff --git a/include/linux/usb/uvc.h b/include/linux/usb/uvc.h index ee19e9f915b8..12a57e1d3467 100644 --- a/include/linux/usb/uvc.h +++ b/include/linux/usb/uvc.h @@ -33,6 +33,23 @@ {0xdc, 0x95, 0x3f, 0x0f, 0x32, 0x26, 0x4e, 0x4c, \ 0x92, 0xc9, 0xa0, 0x47, 0x82, 0xf4, 0x3b, 0xc8} +/* https://learn.microsoft.com/en-us/windows-hardware/drivers/stream/uvc-extensions-1-5#222-extension-unit-controls */ +#define UVC_MSXU_CONTROL_FOCUS 0x01 +#define UVC_MSXU_CONTROL_EXPOSURE 0x02 +#define UVC_MSXU_CONTROL_EVCOMPENSATION 0x03 +#define UVC_MSXU_CONTROL_WHITEBALANCE 0x04 +#define UVC_MSXU_CONTROL_FACE_AUTHENTICATION 0x06 +#define UVC_MSXU_CONTROL_CAMERA_EXTRINSICS 0x07 +#define UVC_MSXU_CONTROL_CAMERA_INTRINSICS 0x08 +#define UVC_MSXU_CONTROL_METADATA 0x09 +#define UVC_MSXU_CONTROL_IR_TORCH 0x0a +#define UVC_MSXU_CONTROL_DIGITALWINDOW 0x0b +#define UVC_MSXU_CONTROL_DIGITALWINDOW_CONFIG 0x0c +#define UVC_MSXU_CONTROL_VIDEO_HDR 0x0d +#define UVC_MSXU_CONTROL_FRAMERATE_THROTTLE 0x0e +#define UVC_MSXU_CONTROL_FIELDOFVIEW2_CONFIG 0x0f +#define UVC_MSXU_CONTROL_FIELDOFVIEW2 0x10 + #define UVC_GUID_FORMAT_MJPEG \ { 'M', 'J', 'P', 'G', 0x00, 0x00, 0x10, 0x00, \ 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -- cgit v1.2.3 From 0f99b8bed426b8f5434b10c3f6f6b92d7ce3c467 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 18 Aug 2025 20:15:39 +0000 Subject: media: uvcvideo: Support UVC_CROSXU_CONTROL_IQ_PROFILE The ChromeOS XU provides a control to change the IQ profile for a camera. It can be switched from VIVID (a.k.a. standard) to NONE (a.k.a. natural). Wire it up to the standard v4l2 control. 
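For illustration only, a hypothetical userspace sketch of selecting the natural profile through the standard control, assuming fd is an open video node and <linux/videodev2.h> is included:

	struct v4l2_control ctrl = {
		.id = V4L2_CID_COLORFX,
		.value = V4L2_COLORFX_NONE,
	};

	if (ioctl(fd, VIDIOC_S_CTRL, &ctrl) < 0)
		perror("VIDIOC_S_CTRL");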
Signed-off-by: Ricardo Ribalda Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede Reviewed-by: Laurent Pinchart Signed-off-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- drivers/media/usb/uvc/uvc_ctrl.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/usb/uvc.h | 5 +++++ 2 files changed, 39 insertions(+) (limited to 'include/linux') diff --git a/drivers/media/usb/uvc/uvc_ctrl.c b/drivers/media/usb/uvc/uvc_ctrl.c index 277209ee40c1..2905505c240c 100644 --- a/drivers/media/usb/uvc/uvc_ctrl.c +++ b/drivers/media/usb/uvc/uvc_ctrl.c @@ -376,6 +376,15 @@ static const struct uvc_control_info uvc_ctrls[] = { | UVC_CTRL_FLAG_GET_DEF | UVC_CTRL_FLAG_AUTO_UPDATE, }, + { + .entity = UVC_GUID_CHROMEOS_XU, + .selector = UVC_CROSXU_CONTROL_IQ_PROFILE, + .index = 3, + .size = 1, + .flags = UVC_CTRL_FLAG_SET_CUR + | UVC_CTRL_FLAG_GET_RANGE + | UVC_CTRL_FLAG_RESTORE, + }, }; static const u32 uvc_control_classes[] = { @@ -384,6 +393,19 @@ static const u32 uvc_control_classes[] = { }; static const int exposure_auto_mapping[] = { 2, 1, 4, 8 }; +static const int cros_colorfx_mapping[] = { + 1, /* V4L2_COLORFX_NONE */ + -1, /* V4L2_COLORFX_BW */ + -1, /* V4L2_COLORFX_SEPIA */ + -1, /* V4L2_COLORFX_NEGATIVE */ + -1, /* V4L2_COLORFX_EMBOSS */ + -1, /* V4L2_COLORFX_SKETCH */ + -1, /* V4L2_COLORFX_SKY_BLUE */ + -1, /* V4L2_COLORFX_GRASS_GREEN */ + -1, /* V4L2_COLORFX_SKIN_WHITEN */ + 0, /* V4L2_COLORFX_VIVID */ +}; + static bool uvc_ctrl_mapping_is_compound(struct uvc_control_mapping *mapping) { @@ -975,6 +997,18 @@ static const struct uvc_control_mapping uvc_ctrl_mappings[] = { .data_type = UVC_CTRL_DATA_TYPE_BITMASK, .name = "Region of Interest Auto Ctrls", }, + { + .id = V4L2_CID_COLORFX, + .entity = UVC_GUID_CHROMEOS_XU, + .selector = UVC_CROSXU_CONTROL_IQ_PROFILE, + .size = 8, + .offset = 0, + .v4l2_type = V4L2_CTRL_TYPE_MENU, + .data_type = UVC_CTRL_DATA_TYPE_ENUM, + .menu_mapping = cros_colorfx_mapping, + .menu_mask = BIT(V4L2_COLORFX_VIVID) | + BIT(V4L2_COLORFX_NONE), + }, }; /* ------------------------------------------------------------------------ diff --git a/include/linux/usb/uvc.h b/include/linux/usb/uvc.h index 12a57e1d3467..22e0dab0809e 100644 --- a/include/linux/usb/uvc.h +++ b/include/linux/usb/uvc.h @@ -29,6 +29,9 @@ #define UVC_GUID_EXT_GPIO_CONTROLLER \ {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x03} +#define UVC_GUID_CHROMEOS_XU \ + {0x24, 0xe9, 0xd7, 0x74, 0xc9, 0x49, 0x45, 0x4a, \ + 0x98, 0xa3, 0xc8, 0x07, 0x7e, 0x05, 0x1c, 0xa3} #define UVC_GUID_MSXU_1_5 \ {0xdc, 0x95, 0x3f, 0x0f, 0x32, 0x26, 0x4e, 0x4c, \ 0x92, 0xc9, 0xa0, 0x47, 0x82, 0xf4, 0x3b, 0xc8} @@ -50,6 +53,8 @@ #define UVC_MSXU_CONTROL_FIELDOFVIEW2_CONFIG 0x0f #define UVC_MSXU_CONTROL_FIELDOFVIEW2 0x10 +#define UVC_CROSXU_CONTROL_IQ_PROFILE 0x04 + #define UVC_GUID_FORMAT_MJPEG \ { 'M', 'J', 'P', 'G', 0x00, 0x00, 0x10, 0x00, \ 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71} -- cgit v1.2.3 From f1880f9cc1476171bec3dae8fd56fff5665e22a4 Mon Sep 17 00:00:00 2001 From: Stéphane Grosjean Date: Fri, 12 Sep 2025 10:17:19 +0200 Subject: can: peak: Modification of references to email accounts being deleted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the upcoming deletion of @peak-system.com accounts and following the acquisition of PEAK-System and its brand by HMS-Networks, this fix aims to migrate all address references to @hms-networks.com, as well as to map my personal committer addresses to author addresses, while taking 
the opportunity to correct the accent on the first ‘e’ of my first name. Signed-off-by: Stéphane Grosjean Link: https://patch.msgid.link/20250912081820.86314-1-stephane.grosjean@free.fr Signed-off-by: Marc Kleine-Budde --- .mailmap | 2 ++ drivers/net/can/peak_canfd/peak_canfd.c | 4 ++-- drivers/net/can/peak_canfd/peak_canfd_user.h | 4 ++-- drivers/net/can/peak_canfd/peak_pciefd_main.c | 6 +++--- drivers/net/can/sja1000/peak_pci.c | 6 +++--- drivers/net/can/sja1000/peak_pcmcia.c | 8 ++++---- drivers/net/can/usb/peak_usb/pcan_usb.c | 6 +++--- drivers/net/can/usb/peak_usb/pcan_usb_core.c | 6 +++--- drivers/net/can/usb/peak_usb/pcan_usb_core.h | 4 ++-- drivers/net/can/usb/peak_usb/pcan_usb_fd.c | 3 ++- drivers/net/can/usb/peak_usb/pcan_usb_pro.c | 4 ++-- drivers/net/can/usb/peak_usb/pcan_usb_pro.h | 4 ++-- include/linux/can/dev/peak_canfd.h | 4 ++-- 13 files changed, 32 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/.mailmap b/.mailmap index 4a5f2c8deeff..c393045d9361 100644 --- a/.mailmap +++ b/.mailmap @@ -740,6 +740,8 @@ Sriram Yagnaraman Stanislav Fomichev Stanislav Fomichev Stefan Wahren +Stéphane Grosjean +Stéphane Grosjean Stéphane Witzmann Stephen Hemminger Stephen Hemminger diff --git a/drivers/net/can/peak_canfd/peak_canfd.c b/drivers/net/can/peak_canfd/peak_canfd.c index 77292afaed22..b5bc80ac7876 100644 --- a/drivers/net/can/peak_canfd/peak_canfd.c +++ b/drivers/net/can/peak_canfd/peak_canfd.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2007, 2011 Wolfgang Grandegger - * Copyright (C) 2012 Stephane Grosjean * - * Copyright (C) 2016 PEAK System-Technik GmbH + * Copyright (C) 2016-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean */ #include diff --git a/drivers/net/can/peak_canfd/peak_canfd_user.h b/drivers/net/can/peak_canfd/peak_canfd_user.h index a72719dc3b74..60c6542028cf 100644 --- a/drivers/net/can/peak_canfd/peak_canfd_user.h +++ b/drivers/net/can/peak_canfd/peak_canfd_user.h @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* CAN driver for PEAK System micro-CAN based adapters * - * Copyright (C) 2003-2011 PEAK System-Technik GmbH - * Copyright (C) 2011-2013 Stephane Grosjean + * Copyright (C) 2003-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean */ #ifndef PEAK_CANFD_USER_H #define PEAK_CANFD_USER_H diff --git a/drivers/net/can/peak_canfd/peak_pciefd_main.c b/drivers/net/can/peak_canfd/peak_pciefd_main.c index 1df3c4b54f03..93558e33bc02 100644 --- a/drivers/net/can/peak_canfd/peak_pciefd_main.c +++ b/drivers/net/can/peak_canfd/peak_pciefd_main.c @@ -1,10 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2007, 2011 Wolfgang Grandegger - * Copyright (C) 2012 Stephane Grosjean * * Derived from the PCAN project file driver/src/pcan_pci.c: * - * Copyright (C) 2001-2006 PEAK System-Technik GmbH + * Copyright (C) 2001-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean */ #include @@ -19,7 +19,7 @@ #include "peak_canfd_user.h" -MODULE_AUTHOR("Stephane Grosjean "); +MODULE_AUTHOR("Stéphane Grosjean "); MODULE_DESCRIPTION("Socket-CAN driver for PEAK PCAN PCIe/M.2 FD family cards"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/net/can/sja1000/peak_pci.c b/drivers/net/can/sja1000/peak_pci.c index da396d641e24..10d88cbda465 100644 --- a/drivers/net/can/sja1000/peak_pci.c +++ b/drivers/net/can/sja1000/peak_pci.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007, 2011 Wolfgang Grandegger - * Copyright (C) 2012 Stephane Grosjean * * Derived 
from the PCAN project file driver/src/pcan_pci.c: * - * Copyright (C) 2001-2006 PEAK System-Technik GmbH + * Copyright (C) 2001-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean */ #include @@ -22,7 +22,7 @@ #include "sja1000.h" -MODULE_AUTHOR("Stephane Grosjean "); +MODULE_AUTHOR("Stéphane Grosjean "); MODULE_DESCRIPTION("Socket-CAN driver for PEAK PCAN PCI family cards"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/net/can/sja1000/peak_pcmcia.c b/drivers/net/can/sja1000/peak_pcmcia.c index ce18e9e56059..e1610b527d13 100644 --- a/drivers/net/can/sja1000/peak_pcmcia.c +++ b/drivers/net/can/sja1000/peak_pcmcia.c @@ -1,10 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2010-2012 Stephane Grosjean - * * CAN driver for PEAK-System PCAN-PC Card * Derived from the PCAN project file driver/src/pcan_pccard.c - * Copyright (C) 2006-2010 PEAK System-Technik GmbH + * + * Copyright (C) 2006-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean */ #include #include @@ -19,7 +19,7 @@ #include #include "sja1000.h" -MODULE_AUTHOR("Stephane Grosjean "); +MODULE_AUTHOR("Stéphane Grosjean "); MODULE_DESCRIPTION("CAN driver for PEAK-System PCAN-PC Cards"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c index 6b293a9056c2..9278a1522aae 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb.c @@ -3,8 +3,8 @@ * CAN driver for PEAK System PCAN-USB adapter * Derived from the PCAN project file driver/src/pcan_usb.c * - * Copyright (C) 2003-2010 PEAK System-Technik GmbH - * Copyright (C) 2011-2012 Stephane Grosjean + * Copyright (C) 2003-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean * * Many thanks to Klaus Hitschler */ @@ -919,7 +919,7 @@ static int pcan_usb_init(struct peak_usb_device *dev) CAN_CTRLMODE_LOOPBACK; } else { dev_info(dev->netdev->dev.parent, - "Firmware update available. Please contact support@peak-system.com\n"); + "Firmware update available. 
Please contact support.peak@hms-networks.com\n"); } return 0; diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.c b/drivers/net/can/usb/peak_usb/pcan_usb_core.c index 117637b9b995..decbf5ed10bd 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_core.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.c @@ -3,8 +3,8 @@ * CAN driver for PEAK System USB adapters * Derived from the PCAN project file driver/src/pcan_usb_core.c * - * Copyright (C) 2003-2010 PEAK System-Technik GmbH - * Copyright (C) 2010-2012 Stephane Grosjean + * Copyright (C) 2003-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean * * Many thanks to Klaus Hitschler */ @@ -24,7 +24,7 @@ #include "pcan_usb_core.h" -MODULE_AUTHOR("Stephane Grosjean "); +MODULE_AUTHOR("Stéphane Grosjean "); MODULE_DESCRIPTION("CAN driver for PEAK-System USB adapters"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_core.h b/drivers/net/can/usb/peak_usb/pcan_usb_core.h index abab00930b9d..d1c1897d47b9 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_core.h +++ b/drivers/net/can/usb/peak_usb/pcan_usb_core.h @@ -3,8 +3,8 @@ * CAN driver for PEAK System USB adapters * Derived from the PCAN project file driver/src/pcan_usb_core.c * - * Copyright (C) 2003-2010 PEAK System-Technik GmbH - * Copyright (C) 2010-2012 Stephane Grosjean + * Copyright (C) 2003-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean * * Many thanks to Klaus Hitschler */ diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c index ebefc274b50a..be84191cde56 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c @@ -2,7 +2,8 @@ /* * CAN driver for PEAK System PCAN-USB FD / PCAN-USB Pro FD adapter * - * Copyright (C) 2013-2014 Stephane Grosjean + * Copyright (C) 2013-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean */ #include #include diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_pro.c b/drivers/net/can/usb/peak_usb/pcan_usb_pro.c index f736196383ac..7be286293b1a 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_pro.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_pro.c @@ -3,8 +3,8 @@ * CAN driver for PEAK System PCAN-USB Pro adapter * Derived from the PCAN project file driver/src/pcan_usbpro.c * - * Copyright (C) 2003-2011 PEAK System-Technik GmbH - * Copyright (C) 2011-2012 Stephane Grosjean + * Copyright (C) 2003-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean */ #include #include diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_pro.h b/drivers/net/can/usb/peak_usb/pcan_usb_pro.h index 28e740af905d..162c7546d3a8 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_pro.h +++ b/drivers/net/can/usb/peak_usb/pcan_usb_pro.h @@ -3,8 +3,8 @@ * CAN driver for PEAK System PCAN-USB Pro adapter * Derived from the PCAN project file driver/src/pcan_usbpro_fw.h * - * Copyright (C) 2003-2011 PEAK System-Technik GmbH - * Copyright (C) 2011-2012 Stephane Grosjean + * Copyright (C) 2003-2025 PEAK System-Technik GmbH + * Author: Stéphane Grosjean */ #ifndef PCAN_USB_PRO_H #define PCAN_USB_PRO_H diff --git a/include/linux/can/dev/peak_canfd.h b/include/linux/can/dev/peak_canfd.h index f38772fd0c07..d3788a3d0942 100644 --- a/include/linux/can/dev/peak_canfd.h +++ b/include/linux/can/dev/peak_canfd.h @@ -2,8 +2,8 @@ /* * CAN driver for PEAK System micro-CAN based adapters * - * Copyright (C) 2003-2011 PEAK System-Technik GmbH - * Copyright (C) 2011-2013 Stephane Grosjean + * Copyright (C) 2003-2025 PEAK System-Technik GmbH + * 
Author: Stéphane Grosjean */ #ifndef PUCAN_H #define PUCAN_H -- cgit v1.2.3 From 6eb350a2233100a283f882c023e5ad426d0ed63b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Aug 2025 17:02:30 +0200 Subject: rseq: Protect event mask against membarrier IPI rseq_need_restart() reads and clears task::rseq_event_mask with preemption disabled to guard against the scheduler. But membarrier() uses an IPI and sets the PREEMPT bit in the event mask from the IPI, which leaves that RMW operation unprotected. Use guard(irq) if CONFIG_MEMBARRIER is enabled to fix that. Fixes: 2a36ab717e8f ("rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ") Signed-off-by: Thomas Gleixner Reviewed-by: Boqun Feng Reviewed-by: Mathieu Desnoyers Cc: stable@vger.kernel.org --- include/linux/rseq.h | 11 ++++++++--- kernel/rseq.c | 10 +++++----- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index bc8af3eb5598..1fbeb61babeb 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -7,6 +7,12 @@ #include #include +#ifdef CONFIG_MEMBARRIER +# define RSEQ_EVENT_GUARD irq +#else +# define RSEQ_EVENT_GUARD preempt +#endif + /* * Map the event mask on the user-space ABI enum rseq_cs_flags * for direct mask checks. @@ -41,9 +47,8 @@ static inline void rseq_handle_notify_resume(struct ksignal *ksig, static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - preempt_disable(); - __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); - preempt_enable(); + scoped_guard(RSEQ_EVENT_GUARD) + __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); rseq_handle_notify_resume(ksig, regs); } diff --git a/kernel/rseq.c b/kernel/rseq.c index b7a1ec327e81..2452b7366b00 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -342,12 +342,12 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags) /* * Load and clear event mask atomically with respect to - * scheduler preemption. + * scheduler preemption and membarrier IPIs. */ - preempt_disable(); - event_mask = t->rseq_event_mask; - t->rseq_event_mask = 0; - preempt_enable(); + scoped_guard(RSEQ_EVENT_GUARD) { + event_mask = t->rseq_event_mask; + t->rseq_event_mask = 0; + } return !!event_mask; } -- cgit v1.2.3 From 2da6de30e60dd9bb14600eff1cc99df2fa2ddae3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:23:15 -0700 Subject: mm: folio_may_be_lru_cached() unless folio_test_large() mm/swap.c and mm/mlock.c agree to drain any per-CPU batch as soon as a large folio is added: so collect_longterm_unpinnable_folios() just wastes effort when calling lru_add_drain[_all]() on a large folio. But although there is good reason not to batch up PMD-sized folios, we might well benefit from batching a small number of low-order mTHPs (though unclear how that "small number" limitation will be implemented). So ask if folio_may_be_lru_cached() rather than !folio_test_large(), to insulate those particular checks from future change. Name preferred to "folio_is_batchable" because large folios can well be put on a batch: it's just the per-CPU LRU caches, drained much later, which need care. Marked for stable, to counter the increase in lru_add_drain_all()s from "mm/gup: check ref_count instead of lru before migration". 
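In caller terms the helper simply gates the per-CPU drains; a condensed sketch of the collect_longterm_unpinnable_folios() pattern from the gup hunk below:

	if (folio_may_be_lru_cached(folio) &&
	    folio_ref_count(folio) != folio_expected_ref_count(folio) + 1)
		lru_add_drain();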
Link: https://lkml.kernel.org/r/57d2eaf8-3607-f318-e0c5-be02dce61ad0@google.com Fixes: 9a4e9f3b2d73 ("mm: update get_user_pages_longterm to migrate pages allocated from CMA region") Signed-off-by: Hugh Dickins Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/swap.h | 10 ++++++++++ mm/gup.c | 4 ++-- mm/mlock.c | 6 +++--- mm/swap.c | 2 +- 4 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2fe6ed2cc3fd..7012a0f758d8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -385,6 +385,16 @@ void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); +static inline bool folio_may_be_lru_cached(struct folio *folio) +{ + /* + * Holding PMD-sized folios in per-CPU LRU cache unbalances accounting. + * Holding small numbers of low-order mTHP folios in per-CPU LRU cache + * will be sensible, but nobody has implemented and tested that yet. + */ + return !folio_test_large(folio); +} + extern atomic_t lru_disable_count; static inline bool lru_cache_disabled(void) diff --git a/mm/gup.c b/mm/gup.c index b47066a54f52..0bc4d140fc07 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2307,13 +2307,13 @@ static unsigned long collect_longterm_unpinnable_folios( continue; } - if (drained == 0 && + if (drained == 0 && folio_may_be_lru_cached(folio) && folio_ref_count(folio) != folio_expected_ref_count(folio) + 1) { lru_add_drain(); drained = 1; } - if (drained == 1 && + if (drained == 1 && folio_may_be_lru_cached(folio) && folio_ref_count(folio) != folio_expected_ref_count(folio) + 1) { lru_add_drain_all(); diff --git a/mm/mlock.c b/mm/mlock.c index a1d93ad33c6d..bb0776f5ef7c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -255,7 +255,7 @@ void mlock_folio(struct folio *folio) folio_get(folio); if (!folio_batch_add(fbatch, mlock_lru(folio)) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } @@ -278,7 +278,7 @@ void mlock_new_folio(struct folio *folio) folio_get(folio); if (!folio_batch_add(fbatch, mlock_new(folio)) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } @@ -299,7 +299,7 @@ void munlock_folio(struct folio *folio) */ folio_get(folio); if (!folio_batch_add(fbatch, folio) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) mlock_folio_batch(fbatch); local_unlock(&mlock_fbatch.lock); } diff --git a/mm/swap.c b/mm/swap.c index 6ae2d5680574..b74ebe865dd9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -192,7 +192,7 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, local_lock(&cpu_fbatches.lock); if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || - folio_test_large(folio) || lru_cache_disabled()) + !folio_may_be_lru_cached(folio) || lru_cache_disabled()) folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn); if 
(disable_irq) -- cgit v1.2.3 From e6a0deb6fa5b0fc134ee2aa127d1cfc9456d8445 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 8 Sep 2025 13:15:12 -0700 Subject: mm/damon/core: introduce damon_call_control->dealloc_on_cancel Patch series "mm/damon/sysfs: fix refresh_ms control overwriting on multi-kdamonds usages". The automatic essential DAMON/DAMOS status update feature of the DAMON sysfs interface (refresh_ms) is broken [1] for the multiple DAMON contexts (kdamonds) use case, since it uses a single global damon_call_control object for all created DAMON contexts. The fields of the object, particularly the list field, are overwritten across the contexts, causing unexpected results including user-space hangs and kernel crashes [2]. Fix it by extending damon_call_control for the use case and updating the usage on the DAMON sysfs interface to use per-context, dynamically allocated damon_call_control objects. This patch (of 2): When damon_call_control->repeat is set, damon_call() is executed asynchronously, and is eventually canceled when kdamond finishes. If the damon_call_control object is dynamically allocated, finding the place to deallocate the object is difficult. Introduce a new damon_call_control field, namely dealloc_on_cancel, to ask kdamond to deallocate such dynamically allocated objects when they are canceled. Link: https://lkml.kernel.org/r/20250908201513.60802-3-sj@kernel.org Link: https://lkml.kernel.org/r/20250908201513.60802-2-sj@kernel.org Fixes: d809a7c64ba8 ("mm/damon/sysfs: implement refresh_ms file internal work") Signed-off-by: SeongJae Park Cc: Yunjeong Mun Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/core.c | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index f13664c62ddd..9e62b2a85538 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -636,6 +636,7 @@ struct damon_operations { * @data: Data that will be passed to @fn. * @repeat: Repeat invocations. * @return_code: Return code from @fn invocation. + * @dealloc_on_cancel: De-allocate when canceled. * * Control damon_call(), which requests specific kdamond to invoke a given * function. Refer to damon_call() for more details. @@ -645,6 +646,7 @@ struct damon_call_control { void *data; bool repeat; int return_code; + bool dealloc_on_cancel; /* private: internal use only */ /* informs if the kdamond finished handling of the request */ struct completion completion; diff --git a/mm/damon/core.c b/mm/damon/core.c index c2e0b469fd43..08065b363972 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2479,10 +2479,14 @@ static void kdamond_call(struct damon_ctx *ctx, bool cancel) mutex_lock(&ctx->call_controls_lock); list_del(&control->list); mutex_unlock(&ctx->call_controls_lock); - if (!control->repeat) + if (!control->repeat) { complete(&control->completion); - else + } else if (control->canceled && control->dealloc_on_cancel) { + kfree(control); + continue; + } else { list_add(&control->list, &repeat_controls); + } } control = list_first_entry_or_null(&repeat_controls, struct damon_call_control, list); -- cgit v1.2.3 From 337135e6124b6d37d7ef1cd5a6c0b9681938c5ee Mon Sep 17 00:00:00 2001 From: Ruan Shiyang Date: Tue, 29 Jul 2025 11:51:01 +0800 Subject: mm: memory-tiering: fix PGPROMOTE_CANDIDATE counting Goto-san reported confusing pgpromote statistics where the pgpromote_success count significantly exceeded pgpromote_candidate.
On a system with three nodes (nodes 0-1: DRAM 4GB, node 2: NVDIMM 4GB):

  # Enable demotion only
  echo 1 > /sys/kernel/mm/numa/demotion_enabled
  numactl -m 0-1 memhog -r200 3500M >/dev/null &
  pid=$!
  sleep 2
  numactl memhog -r100 2500M >/dev/null &
  sleep 10
  kill -9 $pid # terminate the 1st memhog

  # Enable promotion
  echo 2 > /proc/sys/kernel/numa_balancing

After a few seconds, we observed `pgpromote_candidate < pgpromote_success`:

  $ grep -e pgpromote /proc/vmstat
  pgpromote_success 2579
  pgpromote_candidate 0

In this scenario, after terminating the first memhog, the conditions for pgdat_free_space_enough() are quickly met, which triggers promotion. However, these migrated pages are only counted in PGPROMOTE_SUCCESS, not in PGPROMOTE_CANDIDATE. To resolve these confusing statistics, introduce PGPROMOTE_CANDIDATE_NRL to count the missed promotion pages. Not counting these pages in PGPROMOTE_CANDIDATE also avoids changing the existing algorithm or the performance of the promotion rate limit. Link: https://lkml.kernel.org/r/20250901090122.124262-1-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/20250729035101.1601407-1-ruansy.fnst@fujitsu.com Fixes: c6833e10008f ("memory tiering: rate limit NUMA migration throughput") Co-developed-by: Li Zhijian Signed-off-by: Li Zhijian Signed-off-by: Ruan Shiyang Reported-by: Yasunori Gotou (Fujitsu) Suggested-by: Huang Ying Acked-by: Vlastimil Babka Reviewed-by: Huang Ying Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Valentin Schneider Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 16 +++++++++++++++- kernel/sched/fair.c | 5 +++-- mm/vmstat.c | 1 + 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0c5da9141983..9d3ea9085556 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -234,7 +234,21 @@ enum node_stat_item { #endif #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ - PGPROMOTE_CANDIDATE, /* candidate pages to promote */ + /** + * Candidate pages for promotion based on hint fault latency. This + * counter is used to control the promotion rate and adjust the hot + * threshold. + */ + PGPROMOTE_CANDIDATE, + /** + * Not rate-limited (NRL) candidate pages for those can be promoted + * without considering hot threshold because of enough free pages in + * fast-tier node. These promotions bypass the regular hotness checks + * and do NOT influence the promotion rate-limiter or + * threshold-adjustment logic. + * This is for statistics/monitoring purposes.
+ */ + PGPROMOTE_CANDIDATE_NRL, #endif /* PGDEMOTE_*: pages demoted */ PGDEMOTE_KSWAPD, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c..82c8d804c54c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1923,11 +1923,13 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, struct pglist_data *pgdat; unsigned long rate_limit; unsigned int latency, th, def_th; + long nr = folio_nr_pages(folio); pgdat = NODE_DATA(dst_nid); if (pgdat_free_space_enough(pgdat)) { /* workload changed, reset hot threshold */ pgdat->nbp_threshold = 0; + mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE_NRL, nr); return true; } @@ -1941,8 +1943,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, if (latency >= th) return false; - return !numa_promotion_rate_limit(pgdat, rate_limit, - folio_nr_pages(folio)); + return !numa_promotion_rate_limit(pgdat, rate_limit, nr); } this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); diff --git a/mm/vmstat.c b/mm/vmstat.c index 71cd1ceba191..e74f0b2a1021 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1280,6 +1280,7 @@ const char * const vmstat_text[] = { #ifdef CONFIG_NUMA_BALANCING [I(PGPROMOTE_SUCCESS)] = "pgpromote_success", [I(PGPROMOTE_CANDIDATE)] = "pgpromote_candidate", + [I(PGPROMOTE_CANDIDATE_NRL)] = "pgpromote_candidate_nrl", #endif [I(PGDEMOTE_KSWAPD)] = "pgdemote_kswapd", [I(PGDEMOTE_DIRECT)] = "pgdemote_direct", -- cgit v1.2.3 From 79e1c24285c40cdfa9eb00fe8131d1ba14b84ef1 Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Fri, 18 Jul 2025 10:41:32 +0800 Subject: mm: replace (20 - PAGE_SHIFT) with common macros for pages<->MB conversion Replace repeated (20 - PAGE_SHIFT) calculations with standard macros: - MB_TO_PAGES(mb) converts MB to page count - PAGES_TO_MB(pages) converts pages to MB No functional change. [akpm@linux-foundation.org: remove arc's private PAGES_TO_MB, remove its unused PAGES_TO_KB] [akpm@linux-foundation.org: don't include mm.h due to include file ordering mess] Link: https://lkml.kernel.org/r/20250718024134.1304745-1-ye.liu@linux.dev Signed-off-by: Ye Liu Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes Reviewed-by: Dev Jain Acked-by: David Hildenbrand Acked-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Ben Segall Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Dietmar Eggemann Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Josh Triplett Cc: Juri Lelli Cc: Kairui Song Cc: Kemeng Shi Cc: Lai jiangshan Cc: Liam Howlett Cc: Mariano Pache Cc: Mathieu Desnoyers Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Neeraj Upadhyay Cc: Nhat Pham Cc: "Paul E . 
McKenney" Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/arc/include/asm/arcregs.h | 3 --- include/linux/mm.h | 9 +++++++++ kernel/rcu/rcuscale.c | 2 +- kernel/sched/fair.c | 5 ++--- mm/backing-dev.c | 2 +- mm/huge_memory.c | 2 +- mm/swap.c | 2 +- 7 files changed, 15 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index a31bbf5c8bbc..d84908a177bd 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -151,9 +151,6 @@ /* Helpers */ #define TO_KB(bytes) ((bytes) >> 10) #define TO_MB(bytes) (TO_KB(bytes) >> 10) -#define PAGES_TO_KB(n_pages) ((n_pages) << (PAGE_SHIFT - 10)) -#define PAGES_TO_MB(n_pages) (PAGES_TO_KB(n_pages) >> 10) - /* *************************************************************** diff --git a/include/linux/mm.h b/include/linux/mm.h index 1ae97a0b8ec7..b626d1bacef5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -69,6 +69,15 @@ static inline void totalram_pages_add(long count) extern void * high_memory; +/* + * Convert between pages and MB + * 20 is the shift for 1MB (2^20 = 1MB) + * PAGE_SHIFT is the shift for page size (e.g., 12 for 4KB pages) + * So (20 - PAGE_SHIFT) converts between pages and MB + */ +#define PAGES_TO_MB(pages) ((pages) >> (20 - PAGE_SHIFT)) +#define MB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) + #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; #else diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index b521d0455992..7484d8ad5767 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -796,7 +796,7 @@ kfree_scale_thread(void *arg) pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", (unsigned long long)(end_time - start_time), kfree_loops, rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), - (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); + PAGES_TO_MB(mem_begin - mem_during)); if (shutdown) { smp_mb(); /* Assign before wake. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 82c8d804c54c..e256793b9a08 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1495,7 +1495,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) * by the PTE scanner and NUMA hinting faults should be trapped based * on resident pages */ - nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); + nr_scan_pages = MB_TO_PAGES(sysctl_numa_balancing_scan_size); rss = get_mm_rss(p->mm); if (!rss) rss = nr_scan_pages; @@ -1934,8 +1934,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, } def_th = sysctl_numa_balancing_hot_threshold; - rate_limit = sysctl_numa_balancing_promote_rate_limit << \ - (20 - PAGE_SHIFT); + rate_limit = MB_TO_PAGES(sysctl_numa_balancing_promote_rate_limit); numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); th = pgdat->nbp_threshold ? 
: def_th; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 783904d8c5ef..e4d578e6121c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -510,7 +510,7 @@ static void wb_update_bandwidth_workfn(struct work_struct *work) /* * Initial write bandwidth: 100 MB/s */ -#define INIT_BW (100 << (20 - PAGE_SHIFT)) +#define INIT_BW MB_TO_PAGES(100) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, gfp_t gfp) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9c38a95e9f09..2b4ea5a2ce7d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -911,7 +911,7 @@ static int __init hugepage_init(void) * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. */ - if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { + if (totalram_pages() < MB_TO_PAGES(512)) { transparent_hugepage_flags = 0; return 0; } diff --git a/mm/swap.c b/mm/swap.c index 3632dd061beb..cb164f9ef9e3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1096,7 +1096,7 @@ static const struct ctl_table swap_sysctl_table[] = { */ void __init swap_setup(void) { - unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); + unsigned long megs = PAGES_TO_MB(totalram_pages()); /* Use a smaller cluster for small-memory machines */ if (megs < 16) -- cgit v1.2.3 From cc483b328881bbccb55265a86731384d5176fe85 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 4 Aug 2025 16:33:48 -0700 Subject: mm: limit the scope of vma_start_read() Limit the scope of vma_start_read() as it is used only as a helper for higher-level locking functions implemented inside mmap_lock.c and we are about to introduce more complex RCU rules for this function. The change is pure code refactoring and has no functional changes. Link: https://lkml.kernel.org/r/20250804233349.1278678-1-surenb@google.com Suggested-by: Vlastimil Babka Signed-off-by: Suren Baghdasaryan Reviewed-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Jann Horn Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 85 ----------------------------------------------- mm/mmap_lock.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 85 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 11a078de9150..2c9fffa58714 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -147,91 +147,6 @@ static inline void vma_refcount_put(struct vm_area_struct *vma) } } -/* - * Try to read-lock a vma. The function is allowed to occasionally yield false - * locked result to avoid performance overhead, in which case we fall back to - * using mmap_lock. The function should never yield false unlocked result. - * False locked result is possible if mm_lock_seq overflows or if vma gets - * reused and attached to a different mm before we lock it. - * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got - * detached. - * - * WARNING! The vma passed to this function cannot be used if the function - * fails to lock it because in certain cases RCU lock is dropped and then - * reacquired. Once RCU lock is dropped the vma can be concurently freed. - */ -static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, - struct vm_area_struct *vma) -{ - int oldcnt; - - /* - * Check before locking. A race might cause false locked result. 
- * We can use READ_ONCE() for the mm_lock_seq here, and don't need - * ACQUIRE semantics, because this is just a lockless check whose result - * we don't rely on for anything - the mm_lock_seq read against which we - * need ordering is below. - */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) - return NULL; - - /* - * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() - * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. - * Acquire fence is required here to avoid reordering against later - * vm_lock_seq check and checks inside lock_vma_under_rcu(). - */ - if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, - VMA_REF_LIMIT))) { - /* return EAGAIN if vma got detached from under us */ - return oldcnt ? NULL : ERR_PTR(-EAGAIN); - } - - rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); - - /* - * If vma got attached to another mm from under us, that mm is not - * stable and can be freed in the narrow window after vma->vm_refcnt - * is dropped and before rcuwait_wake_up(mm) is called. Grab it before - * releasing vma->vm_refcnt. - */ - if (unlikely(vma->vm_mm != mm)) { - /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ - struct mm_struct *other_mm = vma->vm_mm; - - /* - * __mmdrop() is a heavy operation and we don't need RCU - * protection here. Release RCU lock during these operations. - * We reinstate the RCU read lock as the caller expects it to - * be held when this function returns even on error. - */ - rcu_read_unlock(); - mmgrab(other_mm); - vma_refcount_put(vma); - mmdrop(other_mm); - rcu_read_lock(); - return NULL; - } - - /* - * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. - * False unlocked result is impossible because we modify and check - * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq - * modification invalidates all existing locks. - * - * We must use ACQUIRE semantics for the mm_lock_seq so that if we are - * racing with vma_end_write_all(), we only start reading from the VMA - * after it has been unlocked. - * This pairs with RELEASE semantics in vma_end_write_all(). - */ - if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { - vma_refcount_put(vma); - return NULL; - } - - return vma; -} - /* * Use only while holding mmap read lock which guarantees that locking will not * fail (nobody can concurrently write-lock the vma). vma_start_read() should diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index b006cec8e6fe..10826f347a9f 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -127,6 +127,91 @@ void vma_mark_detached(struct vm_area_struct *vma) } } +/* + * Try to read-lock a vma. The function is allowed to occasionally yield false + * locked result to avoid performance overhead, in which case we fall back to + * using mmap_lock. The function should never yield false unlocked result. + * False locked result is possible if mm_lock_seq overflows or if vma gets + * reused and attached to a different mm before we lock it. + * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got + * detached. + * + * WARNING! The vma passed to this function cannot be used if the function + * fails to lock it because in certain cases RCU lock is dropped and then + * reacquired. Once RCU lock is dropped the vma can be concurently freed. + */ +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + int oldcnt; + + /* + * Check before locking. 
A race might cause false locked result. + * We can use READ_ONCE() for the mm_lock_seq here, and don't need + * ACQUIRE semantics, because this is just a lockless check whose result + * we don't rely on for anything - the mm_lock_seq read against which we + * need ordering is below. + */ + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) + return NULL; + + /* + * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() + * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. + * Acquire fence is required here to avoid reordering against later + * vm_lock_seq check and checks inside lock_vma_under_rcu(). + */ + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) { + /* return EAGAIN if vma got detached from under us */ + return oldcnt ? NULL : ERR_PTR(-EAGAIN); + } + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + + /* + * If vma got attached to another mm from under us, that mm is not + * stable and can be freed in the narrow window after vma->vm_refcnt + * is dropped and before rcuwait_wake_up(mm) is called. Grab it before + * releasing vma->vm_refcnt. + */ + if (unlikely(vma->vm_mm != mm)) { + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + struct mm_struct *other_mm = vma->vm_mm; + + /* + * __mmdrop() is a heavy operation and we don't need RCU + * protection here. Release RCU lock during these operations. + * We reinstate the RCU read lock as the caller expects it to + * be held when this function returns even on error. + */ + rcu_read_unlock(); + mmgrab(other_mm); + vma_refcount_put(vma); + mmdrop(other_mm); + rcu_read_lock(); + return NULL; + } + + /* + * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. + * False unlocked result is impossible because we modify and check + * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq + * modification invalidates all existing locks. + * + * We must use ACQUIRE semantics for the mm_lock_seq so that if we are + * racing with vma_end_write_all(), we only start reading from the VMA + * after it has been unlocked. + * This pairs with RELEASE semantics in vma_end_write_all(). + */ + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { + vma_refcount_put(vma); + return NULL; + } + + return vma; +} + /* * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be * stable and not isolated. If the VMA is not found or is being modified the -- cgit v1.2.3 From 913fff314547c1922002e655bb25199ee38e8825 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Thu, 7 Aug 2025 00:17:47 +0800 Subject: mm, swap: remove fragment clusters counter It was used for calculating the iteration number when the swap allocator wants to scan the whole fragment list. Now the allocator only scans one fragment cluster at a time, so no one uses this counter anymore. 
Remove it as a cleanup; the performance change is marginal: Build linux kernel using 10G ZRAM, make -j96, defconfig with 2G cgroup memory limit, on top of tmpfs, 64kB mTHP enabled: Before: sys time: 6278.45s After: sys time: 6176.34s Change to 8G ZRAM: Before: sys time: 5572.85s After: sys time: 5531.49s Link: https://lkml.kernel.org/r/20250806161748.76651-3-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Nhat Pham Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Kemeng Shi Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - mm/swapfile.c | 7 ------- 2 files changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2fe6ed2cc3fd..a060d102e0d1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -310,7 +310,6 @@ struct swap_info_struct { /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ - atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 1f1110e37f68..5fdb3cb2b8b7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -470,11 +470,6 @@ static void move_cluster(struct swap_info_struct *si, else list_move_tail(&ci->list, list); spin_unlock(&si->lock); - - if (ci->flags == CLUSTER_FLAG_FRAG) - atomic_long_dec(&si->frag_cluster_nr[ci->order]); - else if (new_flags == CLUSTER_FLAG_FRAG) - atomic_long_inc(&si->frag_cluster_nr[ci->order]); ci->flags = new_flags; } @@ -965,7 +960,6 @@ new_cluster: * allocation, but reclaim may drop si->lock and race with another user. */ while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) { - atomic_long_dec(&si->frag_cluster_nr[o]); found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), 0, usage); if (found) @@ -3217,7 +3211,6 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, for (i = 0; i < SWAP_NR_ORDERS; i++) { INIT_LIST_HEAD(&si->nonfull_clusters[i]); INIT_LIST_HEAD(&si->frag_clusters[i]); - atomic_long_set(&si->frag_cluster_nr[i], 0); } /* -- cgit v1.2.3 From 61dc4358d37ae0be3220a0fa32cf7f0ccd4f7636 Mon Sep 17 00:00:00 2001 From: "Adrian Huang (Lenovo)" Date: Wed, 6 Aug 2025 22:59:06 +0800 Subject: mm: correct misleading comment on mmap_lock field in mm_struct The comment previously described the offset of mmap_lock as 0x120 (hex), which is misleading. The correct offset is 56 bytes (decimal) from the last cache line boundary. Using '0x120' could confuse readers trying to understand why the count and owner fields reside in separate cachelines. This change also removes an unnecessary space for improved formatting. 
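For illustration (a hypothetical debug snippet, not part of this patch; the exact values depend on the kernel configuration, e.g. !PREEMPT_RT is assumed here), the offsets in question can be dumped from any code that can call pr_info():

	/* Hypothetical aid: show where mmap_lock's hot fields land. */
	pr_info("mmap_lock.count @ %zu, mmap_lock.owner @ %zu, cacheline: %d bytes\n",
		offsetof(struct mm_struct, mmap_lock.count),
		offsetof(struct mm_struct, mmap_lock.owner),
		SMP_CACHE_BYTES);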
Link: https://lkml.kernel.org/r/20250806145906.24647-1-adrianhuang0701@gmail.com Signed-off-by: Adrian Huang (Lenovo) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 08bc2442db93..3ed763e7ec6f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1026,10 +1026,10 @@ struct mm_struct { * counters */ /* - * With some kernel config, the current mmap_lock's offset - * inside 'mm_struct' is at 0x120, which is very optimal, as + * Typically the current mmap_lock's offset is 56 bytes from + * the last cacheline boundary, which is very optimal, as * its two hot fields 'count' and 'owner' sit in 2 different - * cachelines, and when mmap_lock is highly contended, both + * cachelines, and when mmap_lock is highly contended, both * of the 2 fields will be accessed frequently, current layout * will help to reduce cache bouncing. * -- cgit v1.2.3 From 4c5d3365882dbbc0784688784904f440d7a4c0f1 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Wed, 6 Aug 2025 14:41:08 +0200 Subject: mm/vmalloc: allow to set node and align in vrealloc Patch series "support large align and nid in Rust allocators", v15. The series provides the ability for Rust allocators to set NUMA node and large alignment. This patch (of 4): Reimplement vrealloc() to be able to set node and alignment should a user need to do so. Rename the function to vrealloc_node_align() to better match what it actually does now and introduce macros for vrealloc() and friends for backward compatibility. With that change we also provide the ability for the Rust part of the kernel to set node and alignment in its allocations. Link: https://lkml.kernel.org/r/20250806124034.1724515-1-vitaly.wool@konsulko.se Link: https://lkml.kernel.org/r/20250806124108.1724561-1-vitaly.wool@konsulko.se Signed-off-by: Vitaly Wool Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Vlastimil Babka Cc: Alice Ryhl Cc: Danilo Krummrich Cc: Herbert Xu Cc: Jann Horn Cc: Kent Overstreet Cc: Liam Howlett Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 12 +++++++++--- mm/nommu.c | 3 ++- mm/vmalloc.c | 29 ++++++++++++++++++++++++----- 3 files changed, 35 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 2759dac6be44..eb54b7b3202f 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -197,9 +197,15 @@ extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1 extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2); #define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__)) -void * __must_check vrealloc_noprof(const void *p, size_t size, gfp_t flags) - __realloc_size(2); -#define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__)) +void *__must_check vrealloc_node_align_noprof(const void *p, size_t size, + unsigned long align, gfp_t flags, int nid) __realloc_size(2); +#define vrealloc_node_noprof(_p, _s, _f, _nid) \ + vrealloc_node_align_noprof(_p, _s, 1, _f, _nid) +#define vrealloc_noprof(_p, _s, _f) \ + vrealloc_node_align_noprof(_p, _s, 1, _f, NUMA_NO_NODE) +#define vrealloc_node_align(...) alloc_hooks(vrealloc_node_align_noprof(__VA_ARGS__)) +#define vrealloc_node(...) 
alloc_hooks(vrealloc_node_noprof(__VA_ARGS__)) +#define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__)) extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); diff --git a/mm/nommu.c b/mm/nommu.c index 8b819fafd57b..6fff462f90d3 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -119,7 +119,8 @@ void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) } EXPORT_SYMBOL(__vmalloc_noprof); -void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) +void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int node) { return krealloc_noprof(p, size, (flags | __GFP_COMP) & ~__GFP_HIGHMEM); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6dbcdceecae1..e299b51bd922 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4089,19 +4089,29 @@ void *vzalloc_node_noprof(unsigned long size, int node) EXPORT_SYMBOL(vzalloc_node_noprof); /** - * vrealloc - reallocate virtually contiguous memory; contents remain unchanged + * vrealloc_node_align_noprof - reallocate virtually contiguous memory; contents + * remain unchanged * @p: object to reallocate memory for * @size: the size to reallocate + * @align: requested alignment * @flags: the flags for the page level allocator + * @nid: node number of the target node + * + * If @p is %NULL, vrealloc_XXX() behaves exactly like vmalloc_XXX(). If @size + * is 0 and @p is not a %NULL pointer, the object pointed to is freed. * - * If @p is %NULL, vrealloc() behaves exactly like vmalloc(). If @size is 0 and - * @p is not a %NULL pointer, the object pointed to is freed. + * If the caller wants the new memory to be on specific node *only*, + * __GFP_THISNODE flag should be set, otherwise the function will try to avoid + * reallocation and possibly disregard the specified @nid. * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that * __GFP_ZERO is not fully honored by this API. * + * Requesting an alignment that is bigger than the alignment of the existing + * allocation will fail. + * * In any case, the contents of the object pointed to are preserved up to the * lesser of the new and old sizes. * @@ -4111,7 +4121,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof); * Return: pointer to the allocated memory; %NULL if @size is zero or in case of * failure */ -void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) +void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int nid) { struct vm_struct *vm = NULL; size_t alloced_size = 0; @@ -4135,6 +4146,12 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) if (WARN(alloced_size < old_size, "vrealloc() has mismatched area vs requested sizes (%p)\n", p)) return NULL; + if (WARN(!IS_ALIGNED((unsigned long)p, align), + "will not reallocate with a bigger alignment (0x%lx)\n", align)) + return NULL; + if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && + nid != page_to_nid(vmalloc_to_page(p))) + goto need_realloc; } /* @@ -4165,8 +4182,10 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) return (void *)p; } +need_realloc: /* TODO: Grow the vm_area, i.e. allocate and map additional pages. 
*/ - n = __vmalloc_noprof(size, flags); + n = __vmalloc_node_noprof(size, align, flags, nid, __builtin_return_address(0)); + if (!n) return NULL; -- cgit v1.2.3 From 2cd8231796b5e7133b1c3d66ad7d2a3c42c97258 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Wed, 6 Aug 2025 14:41:47 +0200 Subject: mm/slub: allow to set node and align in k[v]realloc Reimplement k[v]realloc_node() to be able to set node and alignment should a user need to do so. In order to do that while retaining the maximal backward compatibility, add k[v]realloc_node_align() functions and redefine the rest of API using these new ones. While doing that, we also keep the number of _noprof variants to a minimum, which implies some changes to the existing users of older _noprof functions, that basically being bcachefs. With that change we also provide the ability for the Rust part of the kernel to set node and alignment in its K[v]xxx [re]allocations. Link: https://lkml.kernel.org/r/20250806124147.1724658-1-vitaly.wool@konsulko.se Signed-off-by: Vitaly Wool Reviewed-by: Vlastimil Babka Cc: Alice Ryhl Cc: Danilo Krummrich Cc: Herbert Xu Cc: Jann Horn Cc: Kent Overstreet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- fs/bcachefs/darray.c | 2 +- fs/bcachefs/util.h | 2 +- include/linux/bpfptr.h | 2 +- include/linux/slab.h | 39 ++++++++++++++++++++------------- lib/rhashtable.c | 4 ++-- mm/slub.c | 59 ++++++++++++++++++++++++++++++++++++++------------ 6 files changed, 74 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c index e86d36d23e9e..928e83a1ce42 100644 --- a/fs/bcachefs/darray.c +++ b/fs/bcachefs/darray.c @@ -21,7 +21,7 @@ int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_ return -ENOMEM; void *data = likely(bytes < INT_MAX) - ? kvmalloc_noprof(bytes, gfp) + ? kvmalloc_node_align_noprof(bytes, 1, gfp, NUMA_NO_NODE) : vmalloc_noprof(bytes); if (!data) return -ENOMEM; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 6488f098d140..7112fd40ee21 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -61,7 +61,7 @@ static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags) { void *p = unlikely(n >= INT_MAX) ? vmalloc_noprof(n) - : kvmalloc_noprof(n, flags & ~__GFP_ZERO); + : kvmalloc_node_align_noprof(n, 1, flags & ~__GFP_ZERO, NUMA_NO_NODE); if (p && (flags & __GFP_ZERO)) memset(p, 0, n); return p; diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h index 1af241525a17..f6e0795db484 100644 --- a/include/linux/bpfptr.h +++ b/include/linux/bpfptr.h @@ -67,7 +67,7 @@ static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset, static inline void *kvmemdup_bpfptr_noprof(bpfptr_t src, size_t len) { - void *p = kvmalloc_noprof(len, GFP_USER | __GFP_NOWARN); + void *p = kvmalloc_node_align_noprof(len, 1, GFP_USER | __GFP_NOWARN, NUMA_NO_NODE); if (!p) return ERR_PTR(-ENOMEM); diff --git a/include/linux/slab.h b/include/linux/slab.h index d5a8ab98035c..6dc300bac2a1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -465,9 +465,13 @@ int kmem_cache_shrink(struct kmem_cache *s); /* * Common kmalloc functions provided by all allocators */ -void * __must_check krealloc_noprof(const void *objp, size_t new_size, - gfp_t flags) __realloc_size(2); -#define krealloc(...) 
alloc_hooks(krealloc_noprof(__VA_ARGS__)) +void * __must_check krealloc_node_align_noprof(const void *objp, size_t new_size, + unsigned long align, + gfp_t flags, int nid) __realloc_size(2); +#define krealloc_noprof(_o, _s, _f) krealloc_node_align_noprof(_o, _s, 1, _f, NUMA_NO_NODE) +#define krealloc_node_align(...) alloc_hooks(krealloc_node_align_noprof(__VA_ARGS__)) +#define krealloc_node(_o, _s, _f, _n) krealloc_node_align(_o, _s, 1, _f, _n) +#define krealloc(...) krealloc_node(__VA_ARGS__, NUMA_NO_NODE) void kfree(const void *objp); void kfree_sensitive(const void *objp); @@ -1041,18 +1045,20 @@ static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags) #define kzalloc(...) alloc_hooks(kzalloc_noprof(__VA_ARGS__)) #define kzalloc_node(_size, _flags, _node) kmalloc_node(_size, (_flags)|__GFP_ZERO, _node) -void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) __alloc_size(1); -#define kvmalloc_node_noprof(size, flags, node) \ - __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node) -#define kvmalloc_node(...) alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__)) - -#define kvmalloc(_size, _flags) kvmalloc_node(_size, _flags, NUMA_NO_NODE) -#define kvmalloc_noprof(_size, _flags) kvmalloc_node_noprof(_size, _flags, NUMA_NO_NODE) +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, + gfp_t flags, int node) __alloc_size(1); +#define kvmalloc_node_align_noprof(_size, _align, _flags, _node) \ + __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, NULL), _align, _flags, _node) +#define kvmalloc_node_align(...) \ + alloc_hooks(kvmalloc_node_align_noprof(__VA_ARGS__)) +#define kvmalloc_node(_s, _f, _n) kvmalloc_node_align(_s, 1, _f, _n) +#define kvmalloc(...) kvmalloc_node(__VA_ARGS__, NUMA_NO_NODE) #define kvzalloc(_size, _flags) kvmalloc(_size, (_flags)|__GFP_ZERO) #define kvzalloc_node(_size, _flags, _node) kvmalloc_node(_size, (_flags)|__GFP_ZERO, _node) + #define kmem_buckets_valloc(_b, _size, _flags) \ - alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE)) + alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), 1, _flags, NUMA_NO_NODE)) static inline __alloc_size(1, 2) void * kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) @@ -1062,7 +1068,7 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; - return kvmalloc_node_noprof(bytes, flags, node); + return kvmalloc_node_align_noprof(bytes, 1, flags, node); } #define kvmalloc_array_noprof(...) kvmalloc_array_node_noprof(__VA_ARGS__, NUMA_NO_NODE) @@ -1073,9 +1079,12 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) #define kvcalloc_node(...) alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__)) #define kvcalloc(...) alloc_hooks(kvcalloc_noprof(__VA_ARGS__)) -void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) - __realloc_size(2); -#define kvrealloc(...) alloc_hooks(kvrealloc_noprof(__VA_ARGS__)) +void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int nid) __realloc_size(2); +#define kvrealloc_node_align(...) \ + alloc_hooks(kvrealloc_node_align_noprof(__VA_ARGS__)) +#define kvrealloc_node(_p, _s, _f, _n) kvrealloc_node_align(_p, _s, 1, _f, _n) +#define kvrealloc(...) 
kvrealloc_node(__VA_ARGS__, NUMA_NO_NODE) extern void kvfree(const void *addr); DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T)) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 3e555d012ed6..fde0f0e556f8 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -184,8 +184,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, static struct lock_class_key __key; tbl = alloc_hooks_tag(ht->alloc_tag, - kvmalloc_node_noprof(struct_size(tbl, buckets, nbuckets), - gfp|__GFP_ZERO, NUMA_NO_NODE)); + kvmalloc_node_align_noprof(struct_size(tbl, buckets, nbuckets), + 1, gfp|__GFP_ZERO, NUMA_NO_NODE)); size = nbuckets; diff --git a/mm/slub.c b/mm/slub.c index 30003763d224..8dbeabc6a0f0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4881,7 +4881,7 @@ void kfree(const void *object) EXPORT_SYMBOL(kfree); static __always_inline __realloc_size(2) void * -__do_krealloc(const void *p, size_t new_size, gfp_t flags) +__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid) { void *ret; size_t ks = 0; @@ -4895,6 +4895,16 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) if (!kasan_check_byte(p)) return NULL; + /* + * If reallocation is not necessary (e. g. the new size is less + * than the current allocated size), the current allocation will be + * preserved unless __GFP_THISNODE is set. In the latter case a new + * allocation on the requested node will be attempted. + */ + if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && + nid != page_to_nid(virt_to_page(p))) + goto alloc_new; + if (is_kfence_address(p)) { ks = orig_size = kfence_ksize(p); } else { @@ -4917,6 +4927,10 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) if (new_size > ks) goto alloc_new; + /* If the old object doesn't satisfy the new alignment, allocate a new one */ + if (!IS_ALIGNED((unsigned long)p, align)) + goto alloc_new; + /* Zero out spare memory. */ if (want_init_on_alloc(flags)) { kasan_disable_current(); @@ -4939,7 +4953,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) return (void *)p; alloc_new: - ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_); + ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_); if (ret && p) { /* Disable KASAN checks as the object's redzone is accessed. */ kasan_disable_current(); @@ -4951,14 +4965,19 @@ alloc_new: } /** - * krealloc - reallocate memory. The contents will remain unchanged. + * krealloc_node_align - reallocate memory. The contents will remain unchanged. * @p: object to reallocate memory for. * @new_size: how many bytes of memory are required. + * @align: desired alignment. * @flags: the type of memory to allocate. + * @nid: NUMA node or NUMA_NO_NODE * * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size * is 0 and @p is not a %NULL pointer, the object pointed to is freed. * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. 
Otherwise, it is possible that @@ -4983,7 +5002,8 @@ alloc_new: * * Return: pointer to the allocated memory or %NULL in case of error */ -void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) +void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align, + gfp_t flags, int nid) { void *ret; @@ -4992,13 +5012,13 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) return ZERO_SIZE_PTR; } - ret = __do_krealloc(p, new_size, flags); + ret = __do_krealloc(p, new_size, align, flags, nid); if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret)) kfree(p); return ret; } -EXPORT_SYMBOL(krealloc_noprof); +EXPORT_SYMBOL(krealloc_node_align_noprof); static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) { @@ -5029,9 +5049,13 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) * failure, fall back to non-contiguous (vmalloc) allocation. * @size: size of the request. * @b: which set of kmalloc buckets to allocate from. + * @align: desired alignment. * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. * @node: numa node to allocate from * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * @@ -5041,7 +5065,8 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) * * Return: pointer to the allocated memory of %NULL in case of failure */ -void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, + gfp_t flags, int node) { void *ret; @@ -5071,7 +5096,7 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) * about the resulting pointer, and cannot play * protection games. */ - return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, node, __builtin_return_address(0)); } @@ -5115,14 +5140,19 @@ void kvfree_sensitive(const void *addr, size_t len) EXPORT_SYMBOL(kvfree_sensitive); /** - * kvrealloc - reallocate memory; contents remain unchanged + * kvrealloc_node_align - reallocate memory; contents remain unchanged * @p: object to reallocate memory for * @size: the size to reallocate + * @align: desired alignment * @flags: the flags for the page level allocator + * @nid: NUMA node id * * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 * and @p is not a %NULL pointer, the object pointed to is freed. * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. 
Otherwise, it is possible that @@ -5136,17 +5166,18 @@ EXPORT_SYMBOL(kvfree_sensitive); * * Return: pointer to the allocated memory or %NULL in case of error */ -void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) +void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int nid) { void *n; if (is_vmalloc_addr(p)) - return vrealloc_noprof(p, size, flags); + return vrealloc_node_align_noprof(p, size, align, flags, nid); - n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); + n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid); if (!n) { /* We failed to krealloc(), fall back to kvmalloc(). */ - n = kvmalloc_noprof(size, flags); + n = kvmalloc_node_align_noprof(size, align, flags, nid); if (!n) return NULL; @@ -5162,7 +5193,7 @@ void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) return n; } -EXPORT_SYMBOL(kvrealloc_noprof); +EXPORT_SYMBOL(kvrealloc_node_align_noprof); struct detached_freelist { struct slab *slab; -- cgit v1.2.3 From e6d7d3502e00ed6f86e03dcdb282cb7785e55448 Mon Sep 17 00:00:00 2001 From: Sang-Heon Jeon Date: Tue, 5 Aug 2025 21:39:40 +0900 Subject: mm/damon: update expired description of damos_action Nowadays, damos operation actions support a greater operation set. But comments (also, generated documentation) weren't updated. So fix the comments with current support status. Link: https://lkml.kernel.org/r/20250805123940.13691-1-ekffu200098@gmail.com Signed-off-by: Sang-Heon Jeon Reviewed-by: SeongJae Park Cc: Honggyu Kim Signed-off-by: Andrew Morton --- include/linux/damon.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index f13664c62ddd..d01bfee80bd6 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -110,7 +110,7 @@ struct damon_target { * * @DAMOS_WILLNEED: Call ``madvise()`` for the region with MADV_WILLNEED. * @DAMOS_COLD: Call ``madvise()`` for the region with MADV_COLD. - * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. + * @DAMOS_PAGEOUT: Reclaim the region. * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. @@ -121,10 +121,10 @@ struct damon_target { * @NR_DAMOS_ACTIONS: Total number of DAMOS actions * * The support of each action is up to running &struct damon_operations. - * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR supports all actions except - * &enum DAMOS_LRU_PRIO and &enum DAMOS_LRU_DEPRIO. &enum DAMON_OPS_PADDR - * supports only &enum DAMOS_PAGEOUT, &enum DAMOS_LRU_PRIO, &enum - * DAMOS_LRU_DEPRIO, and &DAMOS_STAT. + * Refer to 'Operation Action' section of Documentation/mm/damon/design.rst for + * status of the supports. + * + * Note that DAMOS_PAGEOUT doesn't trigger demotions. */ enum damos_action { DAMOS_WILLNEED, -- cgit v1.2.3 From 95c2908f1a4fd608b1cdbb5acef3572e5d769e1c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 16:39:47 +0200 Subject: mm/migrate: remove MIGRATEPAGE_UNMAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit migrate_folio_unmap() is the only user of MIGRATEPAGE_UNMAP. We want to remove MIGRATEPAGE_* completely. 
It's rather weird to have a generic MIGRATEPAGE_UNMAP, documented to be returned from address-space callbacks, when it's only used for an internal helper. Let's start by having only a single "success" return value for migrate_folio_unmap() -- 0 -- by moving the "folio was already freed" check into the single caller. There is a remaining comment for PG_isolated, which we renamed to PG_movable_ops_isolated recently and forgot to update. While we might still run into that case with zsmalloc, it's something we want to get rid of soon. So let's just focus that optimization on real folios only for now by excluding movable_ops pages. Note that concurrent freeing can happen at any time and this "already freed" check is not relevant for correctness. [david@redhat.com: no need to pass "reason" to migrate_folio_unmap(), per Lance] Link: https://lkml.kernel.org/r/3bb725f8-28d7-4aa2-b75f-af40d5cab280@redhat.com Link: https://lkml.kernel.org/r/20250811143949.1117439-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Lance Yang Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Benjamin LaHaise Cc: Byungchul Park Cc: Chris Mason Cc: Christian Brauner Cc: Christophe Leroy Cc: Dave Kleikamp Cc: David Sterba Cc: Eugenio Pé rez Cc: Greg Kroah-Hartman Cc: Gregory Price Cc: "Huang, Ying" Cc: Jan Kara Cc: Jason Wang Cc: Jerrin Shaji George Cc: Josef Bacik Cc: Joshua Hahn Cc: Madhavan Srinivasan Cc: Mathew Brost Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Minchan Kim Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Rakie Kim Cc: Sergey Senozhatsky Cc: Xuan Zhuo Cc: Dave Kleikamp Signed-off-by: Andrew Morton --- include/linux/migrate.h | 1 - mm/migrate.c | 45 ++++++++++++++++++++++----------------------- 2 files changed, 22 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 9009e27b5f44..302f3e95faea 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -18,7 +18,6 @@ struct migration_target_control; * - zero on page migration success; */ #define MIGRATEPAGE_SUCCESS 0 -#define MIGRATEPAGE_UNMAP 1 /** * struct movable_operations - Driver page migration diff --git a/mm/migrate.c b/mm/migrate.c index 9e5ef39ce73a..2ef467eae31d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1189,7 +1189,7 @@ static void migrate_folio_done(struct folio *src, static int migrate_folio_unmap(new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, struct folio *src, struct folio **dstp, enum migrate_mode mode, - enum migrate_reason reason, struct list_head *ret) + struct list_head *ret) { struct folio *dst; int rc = -EAGAIN; @@ -1198,16 +1198,6 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, bool locked = false; bool dst_locked = false; - if (folio_ref_count(src) == 1) { - /* Folio was freed from under us. So we are done. */ - folio_clear_active(src); - folio_clear_unevictable(src); - /* free_pages_prepare() will clear PG_isolated. 
*/ - list_del(&src->lru); - migrate_folio_done(src, reason); - return MIGRATEPAGE_SUCCESS; - } - dst = get_new_folio(src, private); if (!dst) return -ENOMEM; @@ -1297,7 +1287,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, if (unlikely(page_has_movable_ops(&src->page))) { __migrate_folio_record(dst, old_page_state, anon_vma); - return MIGRATEPAGE_UNMAP; + return 0; } /* @@ -1327,7 +1317,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, if (!folio_mapped(src)) { __migrate_folio_record(dst, old_page_state, anon_vma); - return MIGRATEPAGE_UNMAP; + return 0; } out: @@ -1870,14 +1860,27 @@ static int migrate_pages_batch(struct list_head *from, continue; } + /* + * If we are holding the last folio reference, the folio + * was freed from under us, so just drop our reference. + */ + if (likely(!page_has_movable_ops(&folio->page)) && + folio_ref_count(folio) == 1) { + folio_clear_active(folio); + folio_clear_unevictable(folio); + list_del(&folio->lru); + migrate_folio_done(folio, reason); + stats->nr_succeeded += nr_pages; + stats->nr_thp_succeeded += is_thp; + continue; + } + rc = migrate_folio_unmap(get_new_folio, put_new_folio, - private, folio, &dst, mode, reason, - ret_folios); + private, folio, &dst, mode, ret_folios); /* * The rules are: - * Success: folio will be freed - * Unmap: folio will be put on unmap_folios list, - * dst folio put on dst_folios list + * 0: folio will be put on unmap_folios list, + * dst folio put on dst_folios list * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * Other errno: put on ret_folios list @@ -1927,11 +1930,7 @@ static int migrate_pages_batch(struct list_head *from, thp_retry += is_thp; nr_retry_pages += nr_pages; break; - case MIGRATEPAGE_SUCCESS: - stats->nr_succeeded += nr_pages; - stats->nr_thp_succeeded += is_thp; - break; - case MIGRATEPAGE_UNMAP: + case 0: list_move_tail(&folio->lru, &unmap_folios); list_add_tail(&dst->lru, &dst_folios); break; -- cgit v1.2.3 From fb49a4425cfa163faccd91f913773d3401d3a7d4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 16:39:48 +0200 Subject: treewide: remove MIGRATEPAGE_SUCCESS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At this point MIGRATEPAGE_SUCCESS is misnamed for all folio users, and now that we remove MIGRATEPAGE_UNMAP, it's really the only "success" return value that the code uses and expects. Let's just get rid of MIGRATEPAGE_SUCCESS completely and just use "0" for success. Link: https://lkml.kernel.org/r/20250811143949.1117439-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan [mm] Acked-by: Dave Kleikamp [jfs] Acked-by: David Sterba [btrfs] Acked-by: Greg Kroah-Hartman Reviewed-by: Byungchul Park Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Benjamin LaHaise Cc: Chris Mason Cc: Christian Brauner Cc: Christophe Leroy Cc: Dave Kleikamp Cc: Eugenio Pé rez Cc: Gregory Price Cc: "Huang, Ying" Cc: Jan Kara Cc: Jason Wang Cc: Jerrin Shaji George Cc: Josef Bacik Cc: Joshua Hahn Cc: Madhavan Srinivasan Cc: Mathew Brost Cc: Michael Ellerman Cc: "Michael S. 
Tsirkin" Cc: Minchan Kim Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Rakie Kim Cc: Sergey Senozhatsky Cc: Xuan Zhuo Cc: Lance Yang Signed-off-by: Andrew Morton --- arch/powerpc/platforms/pseries/cmm.c | 2 +- drivers/misc/vmw_balloon.c | 4 ++-- drivers/virtio/virtio_balloon.c | 2 +- fs/aio.c | 2 +- fs/btrfs/inode.c | 4 ++-- fs/hugetlbfs/inode.c | 4 ++-- fs/jfs/jfs_metapage.c | 8 ++++---- include/linux/migrate.h | 10 +-------- mm/migrate.c | 40 +++++++++++++++++------------------- mm/migrate_device.c | 2 +- mm/zsmalloc.c | 4 ++-- 11 files changed, 36 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 5e0a718d1be7..0823fa2da151 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -545,7 +545,7 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info, /* balloon page list reference */ put_page(page); - return MIGRATEPAGE_SUCCESS; + return 0; } static void cmm_balloon_compaction_init(void) diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 6653fc53c951..6df51ee8db62 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -1806,7 +1806,7 @@ static int vmballoon_migratepage(struct balloon_dev_info *b_dev_info, * the list after acquiring the lock. */ get_page(newpage); - ret = MIGRATEPAGE_SUCCESS; + ret = 0; } /* Update the balloon list under the @pages_lock */ @@ -1817,7 +1817,7 @@ static int vmballoon_migratepage(struct balloon_dev_info *b_dev_info, * If we succeed just insert it to the list and update the statistics * under the lock. */ - if (ret == MIGRATEPAGE_SUCCESS) { + if (!ret) { balloon_page_insert(&b->b_dev_info, newpage); __count_vm_event(BALLOON_MIGRATE); } diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index e299e18346a3..eae65136cdfb 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -875,7 +875,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, balloon_page_finalize(page); put_page(page); /* balloon reference */ - return MIGRATEPAGE_SUCCESS; + return 0; } #endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/fs/aio.c b/fs/aio.c index 7fc7b6221312..059e03cfa088 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -445,7 +445,7 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, folio_get(dst); rc = folio_migrate_mapping(mapping, dst, src, 1); - if (rc != MIGRATEPAGE_SUCCESS) { + if (rc) { folio_put(dst); goto out_unlock; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dd82dcc7b2b7..0bb604dbd673 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7421,7 +7421,7 @@ static int btrfs_migrate_folio(struct address_space *mapping, { int ret = filemap_migrate_folio(mapping, dst, src, mode); - if (ret != MIGRATEPAGE_SUCCESS) + if (ret) return ret; if (folio_test_ordered(src)) { @@ -7429,7 +7429,7 @@ static int btrfs_migrate_folio(struct address_space *mapping, folio_set_ordered(dst); } - return MIGRATEPAGE_SUCCESS; + return 0; } #else #define btrfs_migrate_folio NULL diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 09d4baef29cf..34d496a2b7de 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1052,7 +1052,7 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, int rc; rc = migrate_huge_page_move_mapping(mapping, dst, src); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) return rc; if (hugetlb_folio_subpool(src)) { @@ 
-1063,7 +1063,7 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, folio_migrate_flags(dst, src); - return MIGRATEPAGE_SUCCESS; + return 0; } #else #define hugetlbfs_migrate_folio NULL diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index b98cf3bb6c1f..871cf4fb3636 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -169,7 +169,7 @@ static int __metapage_migrate_folio(struct address_space *mapping, } rc = filemap_migrate_folio(mapping, dst, src, mode); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) return rc; for (i = 0; i < MPS_PER_PAGE; i++) { @@ -199,7 +199,7 @@ static int __metapage_migrate_folio(struct address_space *mapping, } } - return MIGRATEPAGE_SUCCESS; + return 0; } #endif /* CONFIG_MIGRATION */ @@ -242,7 +242,7 @@ static int __metapage_migrate_folio(struct address_space *mapping, return -EAGAIN; rc = filemap_migrate_folio(mapping, dst, src, mode); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) return rc; if (unlikely(insert_metapage(dst, mp))) @@ -253,7 +253,7 @@ static int __metapage_migrate_folio(struct address_space *mapping, mp->folio = dst; remove_metapage(src, mp); - return MIGRATEPAGE_SUCCESS; + return 0; } #endif /* CONFIG_MIGRATION */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 302f3e95faea..1f0ac122c3bf 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -12,13 +12,6 @@ typedef void free_folio_t(struct folio *folio, unsigned long private); struct migration_target_control; -/* - * Return values from addresss_space_operations.migratepage(): - * - negative errno on page migration failure; - * - zero on page migration success; - */ -#define MIGRATEPAGE_SUCCESS 0 - /** * struct movable_operations - Driver page migration * @isolate_page: @@ -34,8 +27,7 @@ struct migration_target_control; * @src page. The driver should copy the contents of the * @src page to the @dst page and set up the fields of @dst page. * Both pages are locked. - * If page migration is successful, the driver should - * return MIGRATEPAGE_SUCCESS. + * If page migration is successful, the driver should return 0. * If the driver cannot migrate the page at the moment, it can return * -EAGAIN. The VM interprets this as a temporary migration failure and * will retry it later. Any other error value is a permanent migration diff --git a/mm/migrate.c b/mm/migrate.c index 2ef467eae31d..8e435a078fc3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -231,18 +231,17 @@ static void putback_movable_ops_page(struct page *page) * src and dst are also released by migration core. These pages will not be * folios in the future, so that must be reworked. * - * Returns MIGRATEPAGE_SUCCESS on success, otherwise a negative error - * code. + * Returns 0 on success, otherwise a negative error code. 
*/ static int migrate_movable_ops_page(struct page *dst, struct page *src, enum migrate_mode mode) { - int rc = MIGRATEPAGE_SUCCESS; + int rc; VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src); VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(src), src); rc = page_movable_ops(src)->migrate_page(dst, src, mode); - if (rc == MIGRATEPAGE_SUCCESS) + if (!rc) ClearPageMovableOpsIsolated(src); return rc; } @@ -587,7 +586,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, if (folio_test_swapbacked(folio)) __folio_set_swapbacked(newfolio); - return MIGRATEPAGE_SUCCESS; + return 0; } oldzone = folio_zone(folio); @@ -688,7 +687,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, } local_irq_enable(); - return MIGRATEPAGE_SUCCESS; + return 0; } int folio_migrate_mapping(struct address_space *mapping, @@ -737,7 +736,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, xas_unlock_irq(&xas); - return MIGRATEPAGE_SUCCESS; + return 0; } /* @@ -853,14 +852,14 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst, return rc; rc = __folio_migrate_mapping(mapping, dst, src, expected_count); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) return rc; if (src_private) folio_attach_private(dst, folio_detach_private(src)); folio_migrate_flags(dst, src); - return MIGRATEPAGE_SUCCESS; + return 0; } /** @@ -967,7 +966,7 @@ recheck_buffers: } rc = filemap_migrate_folio(mapping, dst, src, mode); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) goto unlock_buffers; bh = head; @@ -1071,7 +1070,7 @@ static int fallback_migrate_folio(struct address_space *mapping, * * Return value: * < 0 - error code - * MIGRATEPAGE_SUCCESS - success + * 0 - success */ static int move_to_new_folio(struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -1099,7 +1098,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, else rc = fallback_migrate_folio(mapping, dst, src, mode); - if (rc == MIGRATEPAGE_SUCCESS) { + if (!rc) { /* * For pagecache folios, src->mapping must be cleared before src * is freed. Anonymous folios must stay anonymous until freed. @@ -1449,7 +1448,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ folio_putback_hugetlb(src); - return MIGRATEPAGE_SUCCESS; + return 0; } dst = get_new_folio(src, private); @@ -1512,8 +1511,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, rc = move_to_new_folio(dst, src, mode); if (page_was_mapped) - remove_migration_ptes(src, - rc == MIGRATEPAGE_SUCCESS ? dst : src, 0); + remove_migration_ptes(src, !rc ? 
dst : src, 0); unlock_put_anon: folio_unlock(dst); @@ -1522,7 +1520,7 @@ put_anon: if (anon_vma) put_anon_vma(anon_vma); - if (rc == MIGRATEPAGE_SUCCESS) { + if (!rc) { move_hugetlb_state(src, dst, reason); put_new_folio = NULL; } @@ -1530,7 +1528,7 @@ put_anon: out_unlock: folio_unlock(src); out: - if (rc == MIGRATEPAGE_SUCCESS) + if (!rc) folio_putback_hugetlb(src); else if (rc != -EAGAIN) list_move_tail(&src->lru, ret); @@ -1640,7 +1638,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, reason, ret_folios); /* * The rules are: - * Success: hugetlb folio will be put back + * 0: hugetlb folio will be put back * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * Other errno: put on ret_folios list @@ -1657,7 +1655,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, retry++; nr_retry_pages += nr_pages; break; - case MIGRATEPAGE_SUCCESS: + case 0: stats->nr_succeeded += nr_pages; break; default: @@ -1711,7 +1709,7 @@ static void migrate_folios_move(struct list_head *src_folios, reason, ret_folios); /* * The rules are: - * Success: folio will be freed + * 0: folio will be freed * -EAGAIN: stay on the unmap_folios list * Other errno: put on ret_folios list */ @@ -1721,7 +1719,7 @@ static void migrate_folios_move(struct list_head *src_folios, *thp_retry += is_thp; *nr_retry_pages += nr_pages; break; - case MIGRATEPAGE_SUCCESS: + case 0: stats->nr_succeeded += nr_pages; stats->nr_thp_succeeded += is_thp; break; diff --git a/mm/migrate_device.c b/mm/migrate_device.c index e05e14d6eacd..abd9f6850db6 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -778,7 +778,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, if (migrate && migrate->fault_page == page) extra_cnt = 1; r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt); - if (r != MIGRATEPAGE_SUCCESS) + if (r) src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; else folio_migrate_flags(newfolio, folio); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 805a10b41266..153783d49d34 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1746,7 +1746,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, * instead. */ if (!zpdesc->zspage) - return MIGRATEPAGE_SUCCESS; + return 0; /* The page is locked, so this pointer must remain valid */ zspage = get_zspage(zpdesc); @@ -1813,7 +1813,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, reset_zpdesc(zpdesc); zpdesc_put(zpdesc); - return MIGRATEPAGE_SUCCESS; + return 0; } static void zs_page_putback(struct page *page) -- cgit v1.2.3 From b22cc9a9c7ff0ad8998d58fdd7122de6038c46a7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:27 +0200 Subject: mm/rmap: convert "enum rmap_level" to "enum pgtable_level" Let's factor it out, and convert all checks for unsupported levels to BUILD_BUG(). The code is written in a way such that force-inlining will optimize out the levels. 
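To make the force-inlining point concrete, here is a minimal hedged sketch (illustrative only, not code from the patch; the helper name is invented) of the BUILD_BUG() pattern: each caller passes a compile-time-constant level, __always_inline guarantees the switch constant-folds, and only the matching case survives, so the default arm is provably dead unless a call site passes an unsupported level, in which case the build fails instead of warning at runtime:

#include <linux/build_bug.h>
#include <linux/compiler.h>
#include <linux/pgtable.h>

static __always_inline void assert_supported_level(enum pgtable_level level)
{
	switch (level) {
	case PGTABLE_LEVEL_PTE:
	case PGTABLE_LEVEL_PMD:
	case PGTABLE_LEVEL_PUD:
		break;
	default:
		/* Dead code for every valid constant caller. */
		BUILD_BUG();
	}
}

static void example_caller(void)
{
	assert_supported_level(PGTABLE_LEVEL_PMD);	/* folds away */
	/* assert_supported_level(PGTABLE_LEVEL_PGD); would fail the build */
}

This turns the former runtime VM_WARN_ON_ONCE() for unsupported levels into a compile-time guarantee.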
[nathan@kernel.org: always inline __folio_rmap_sanity_checks()] Link: https://lkml.kernel.org/r/20250814-rmap-fix-build_bug-conversion-v1-1-fb7b10a0b362@kernel.org Link: https://lkml.kernel.org/r/20250811112631.759341-8-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Nathan Chancellor Reviewed-by: Lorenzo Stoakes Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: David Vrabel Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juegren Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Oscar Salvador Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Yang Cc: Zi Yan Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 8 +++++++ include/linux/rmap.h | 62 ++++++++++++++++++++++--------------------------- mm/rmap.c | 56 ++++++++++++++++++++++++-------------------- 3 files changed, 67 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2b80fd456c8b..4f88e460eb9c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1975,6 +1975,14 @@ static inline bool arch_has_pfn_modify_check(void) /* Page-Table Modification Mask */ typedef unsigned int pgtbl_mod_mask; +enum pgtable_level { + PGTABLE_LEVEL_PTE = 0, + PGTABLE_LEVEL_PMD, + PGTABLE_LEVEL_PUD, + PGTABLE_LEVEL_P4D, + PGTABLE_LEVEL_PGD, +}; + #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6cd020eea37a..e8aff6d2deda 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -394,18 +394,8 @@ typedef int __bitwise rmap_t; /* The anonymous (sub)page is exclusive to a single process. */ #define RMAP_EXCLUSIVE ((__force rmap_t)BIT(0)) -/* - * Internally, we're using an enum to specify the granularity. We make the - * compiler emit specialized code for each granularity. - */ -enum rmap_level { - RMAP_LEVEL_PTE = 0, - RMAP_LEVEL_PMD, - RMAP_LEVEL_PUD, -}; - -static inline void __folio_rmap_sanity_checks(const struct folio *folio, - const struct page *page, int nr_pages, enum rmap_level level) +static __always_inline void __folio_rmap_sanity_checks(const struct folio *folio, + const struct page *page, int nr_pages, enum pgtable_level level) { /* hugetlb folios are handled separately. */ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); @@ -427,18 +417,18 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: break; - case RMAP_LEVEL_PMD: + case PGTABLE_LEVEL_PMD: /* * We don't support folios larger than a single PMD yet. So - * when RMAP_LEVEL_PMD is set, we assume that we are creating + * when PGTABLE_LEVEL_PMD is set, we assume that we are creating * a single "entire" mapping of the folio. */ VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio); VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio); break; - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PUD: /* * Assume that we are creating a single "entire" mapping of the * folio. 
@@ -447,7 +437,7 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio); break; default: - VM_WARN_ON_ONCE(true); + BUILD_BUG(); } /* @@ -567,14 +557,14 @@ static inline void hugetlb_remove_rmap(struct folio *folio) static __always_inline void __folio_dup_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma, - enum rmap_level level) + enum pgtable_level level) { const int orig_nr_pages = nr_pages; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { atomic_inc(&folio->_mapcount); break; @@ -587,11 +577,13 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, } folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; - case RMAP_LEVEL_PMD: - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: atomic_inc(&folio->_entire_mapcount); folio_inc_large_mapcount(folio, dst_vma); break; + default: + BUILD_BUG(); } } @@ -609,13 +601,13 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, static inline void folio_dup_file_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma) { - __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, PGTABLE_LEVEL_PTE); } static __always_inline void folio_dup_file_rmap_pte(struct folio *folio, struct page *page, struct vm_area_struct *dst_vma) { - __folio_dup_file_rmap(folio, page, 1, dst_vma, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, 1, dst_vma, PGTABLE_LEVEL_PTE); } /** @@ -632,7 +624,7 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *dst_vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, PGTABLE_LEVEL_PTE); #else WARN_ON_ONCE(true); #endif @@ -640,7 +632,7 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio, static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma, - struct vm_area_struct *src_vma, enum rmap_level level) + struct vm_area_struct *src_vma, enum pgtable_level level) { const int orig_nr_pages = nr_pages; bool maybe_pinned; @@ -665,7 +657,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, * copying if the folio maybe pinned. 
*/ switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (unlikely(maybe_pinned)) { for (i = 0; i < nr_pages; i++) if (PageAnonExclusive(page + i)) @@ -687,8 +679,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, } while (page++, --nr_pages > 0); folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; - case RMAP_LEVEL_PMD: - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: if (PageAnonExclusive(page)) { if (unlikely(maybe_pinned)) return -EBUSY; @@ -697,6 +689,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, atomic_inc(&folio->_entire_mapcount); folio_inc_large_mapcount(folio, dst_vma); break; + default: + BUILD_BUG(); } return 0; } @@ -730,7 +724,7 @@ static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio, struct vm_area_struct *src_vma) { return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma, - src_vma, RMAP_LEVEL_PTE); + src_vma, PGTABLE_LEVEL_PTE); } static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, @@ -738,7 +732,7 @@ static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, struct vm_area_struct *src_vma) { return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma, - RMAP_LEVEL_PTE); + PGTABLE_LEVEL_PTE); } /** @@ -770,7 +764,7 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, { #ifdef CONFIG_TRANSPARENT_HUGEPAGE return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma, - src_vma, RMAP_LEVEL_PMD); + src_vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); return -EBUSY; @@ -778,7 +772,7 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, } static __always_inline int __folio_try_share_anon_rmap(struct folio *folio, - struct page *page, int nr_pages, enum rmap_level level) + struct page *page, int nr_pages, enum pgtable_level level) { VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio); @@ -873,7 +867,7 @@ static __always_inline int __folio_try_share_anon_rmap(struct folio *folio, static inline int folio_try_share_anon_rmap_pte(struct folio *folio, struct page *page) { - return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE); + return __folio_try_share_anon_rmap(folio, page, 1, PGTABLE_LEVEL_PTE); } /** @@ -904,7 +898,7 @@ static inline int folio_try_share_anon_rmap_pmd(struct folio *folio, { #ifdef CONFIG_TRANSPARENT_HUGEPAGE return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR, - RMAP_LEVEL_PMD); + PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); return -EBUSY; diff --git a/mm/rmap.c b/mm/rmap.c index 84a8d8b02ef7..0e9c4041f868 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1265,7 +1265,7 @@ static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) static __always_inline void __folio_add_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - enum rmap_level level) + enum pgtable_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; const int orig_nr_pages = nr_pages; @@ -1274,7 +1274,7 @@ static __always_inline void __folio_add_rmap(struct folio *folio, __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_inc_and_test(&folio->_mapcount); break; @@ -1300,11 +1300,11 @@ static __always_inline void __folio_add_rmap(struct folio *folio, folio_add_large_mapcount(folio, orig_nr_pages, vma); break; - case RMAP_LEVEL_PMD: - case 
RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: first = atomic_inc_and_test(&folio->_entire_mapcount); if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { - if (level == RMAP_LEVEL_PMD && first) + if (level == PGTABLE_LEVEL_PMD && first) nr_pmdmapped = folio_large_nr_pages(folio); nr = folio_inc_return_large_mapcount(folio, vma); if (nr == 1) @@ -1323,7 +1323,7 @@ static __always_inline void __folio_add_rmap(struct folio *folio, * We only track PMD mappings of PMD-sized * folios separately. */ - if (level == RMAP_LEVEL_PMD) + if (level == PGTABLE_LEVEL_PMD) nr_pmdmapped = nr_pages; nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ @@ -1336,6 +1336,8 @@ static __always_inline void __folio_add_rmap(struct folio *folio, } folio_inc_large_mapcount(folio, vma); break; + default: + BUILD_BUG(); } __folio_mod_stat(folio, nr, nr_pmdmapped); } @@ -1427,7 +1429,7 @@ static void __page_check_anon_rmap(const struct folio *folio, static __always_inline void __folio_add_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - unsigned long address, rmap_t flags, enum rmap_level level) + unsigned long address, rmap_t flags, enum pgtable_level level) { int i; @@ -1440,20 +1442,22 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio, if (flags & RMAP_EXCLUSIVE) { switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: for (i = 0; i < nr_pages; i++) SetPageAnonExclusive(page + i); break; - case RMAP_LEVEL_PMD: + case PGTABLE_LEVEL_PMD: SetPageAnonExclusive(page); break; - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PUD: /* * Keep the compiler happy, we don't support anonymous * PUD mappings. */ WARN_ON_ONCE(1); break; + default: + BUILD_BUG(); } } @@ -1507,7 +1511,7 @@ void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page, rmap_t flags) { __folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags, - RMAP_LEVEL_PTE); + PGTABLE_LEVEL_PTE); } /** @@ -1528,7 +1532,7 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page, { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags, - RMAP_LEVEL_PMD); + PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif @@ -1609,7 +1613,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, static __always_inline void __folio_add_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - enum rmap_level level) + enum pgtable_level level) { VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); @@ -1634,7 +1638,7 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio, void folio_add_file_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { - __folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE); + __folio_add_file_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE); } /** @@ -1651,7 +1655,7 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD); + __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif @@ -1672,7 +1676,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page, { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) - __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, 
RMAP_LEVEL_PUD); + __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD); #else WARN_ON_ONCE(true); #endif @@ -1680,7 +1684,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page, static __always_inline void __folio_remove_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - enum rmap_level level) + enum pgtable_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; int last = 0, nr = 0, nr_pmdmapped = 0; @@ -1689,7 +1693,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_add_negative(-1, &folio->_mapcount); break; @@ -1719,11 +1723,11 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, partially_mapped = nr && atomic_read(mapped); break; - case RMAP_LEVEL_PMD: - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { last = atomic_add_negative(-1, &folio->_entire_mapcount); - if (level == RMAP_LEVEL_PMD && last) + if (level == PGTABLE_LEVEL_PMD && last) nr_pmdmapped = folio_large_nr_pages(folio); nr = folio_dec_return_large_mapcount(folio, vma); if (!nr) { @@ -1743,7 +1747,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped); if (likely(nr < ENTIRELY_MAPPED)) { nr_pages = folio_large_nr_pages(folio); - if (level == RMAP_LEVEL_PMD) + if (level == PGTABLE_LEVEL_PMD) nr_pmdmapped = nr_pages; nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of another remove and an add? */ @@ -1757,6 +1761,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, partially_mapped = nr && nr < nr_pmdmapped; break; + default: + BUILD_BUG(); } /* @@ -1796,7 +1802,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, void folio_remove_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { - __folio_remove_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE); + __folio_remove_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE); } /** @@ -1813,7 +1819,7 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD); + __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif @@ -1834,7 +1840,7 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page, { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) - __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD); + __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD); #else WARN_ON_ONCE(true); #endif -- cgit v1.2.3 From ec63a44011dccebca24e7ef7e8a9521306de1bc9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:28 +0200 Subject: mm/memory: convert print_bad_pte() to print_bad_page_map() print_bad_pte() looks like something that should actually be a WARN or similar, but historically it apparently has proven to be useful to detect corruption of page tables even on production systems -- report the issue and keep the system running to make it easier to actually detect what is going wrong (e.g., multiple such messages might shed a light). 
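For readability, this is the ratelimiting that makes the report-and-keep-running approach viable on production systems, reflowed from the is_bad_page_map_ratelimited() helper this patch factors out below: up to 60 reports print in a burst, then reporting pauses for 60 seconds while suppressed reports are only counted.

static bool is_bad_page_map_ratelimited(void)
{
	static unsigned long resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;

	/* Allow a burst of 60 reports, then at most one batch per minute. */
	if (nr_shown == 60) {
		if (time_before(jiffies, resume)) {
			nr_unshown++;	/* suppressed: counted, not printed */
			return true;
		}
		if (nr_unshown) {
			pr_alert("BUG: Bad page map: %lu messages suppressed\n",
				 nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = jiffies + 60 * HZ;
	return false;
}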
As we want to unify vm_normal_page_*() handling for PTE/PMD/PUD, we'll have to take care of print_bad_pte() as well. Let's prepare for using print_bad_pte() also for non-PTEs by adjusting the implementation and renaming the function to print_bad_page_map(). Provide print_bad_pte() as a simple wrapper. Document the implicit locking requirements for the page table re-walk. To make the function a bit more readable, factor out the ratelimit check into is_bad_page_map_ratelimited() and place the printing of page table content into __print_bad_page_map_pgtable(). We'll now dump information from each level in a single line, and just stop the table walk once we hit something that is not a present page table. The report will now look something like (dumping pgd to pmd values): [ 77.943408] BUG: Bad page map in process XXX pte:80000001233f5867 [ 77.944077] addr:00007fd84bb1c000 vm_flags:08100071 anon_vma: ... [ 77.945186] pgd:10a89f067 p4d:10a89f067 pud:10e5a2067 pmd:105327067 Not using pgdp_get(), because that does not work properly on some arm configs where pgd_t is an array. Note that we are dumping all levels even when levels are folded for simplicity. [david@redhat.com: drop warning] Link: https://lkml.kernel.org/r/923b279c-de33-44dd-a923-2959afad8626@redhat.com Link: https://lkml.kernel.org/r/20250811112631.759341-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: David Vrabel Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juegren Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Oscar Salvador Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Yang Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 18 +++++++++ mm/memory.c | 104 ++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 102 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 4f88e460eb9c..94249e671a7e 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1983,6 +1983,24 @@ enum pgtable_level { PGTABLE_LEVEL_PGD, }; +static inline const char *pgtable_level_to_str(enum pgtable_level level) +{ + switch (level) { + case PGTABLE_LEVEL_PTE: + return "pte"; + case PGTABLE_LEVEL_PMD: + return "pmd"; + case PGTABLE_LEVEL_PUD: + return "pud"; + case PGTABLE_LEVEL_P4D: + return "p4d"; + case PGTABLE_LEVEL_PGD: + return "pgd"; + default: + return "unknown"; + } +} + #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) diff --git a/mm/memory.c b/mm/memory.c index 626caedce35e..dc0107354d37 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -491,22 +491,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) add_mm_counter(mm, i, rss[i]); } -/* - * This function is called to print an error when a bad pte - * is found. For example, we might have a PFN-mapped pte in - * a region that doesn't allow it. - * - * The calling function must still handle the error. 
- */ -static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, - pte_t pte, struct page *page) +static bool is_bad_page_map_ratelimited(void) { - pgd_t *pgd = pgd_offset(vma->vm_mm, addr); - p4d_t *p4d = p4d_offset(pgd, addr); - pud_t *pud = pud_offset(p4d, addr); - pmd_t *pmd = pmd_offset(pud, addr); - struct address_space *mapping; - pgoff_t index; static unsigned long resume; static unsigned long nr_shown; static unsigned long nr_unshown; @@ -518,7 +504,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, if (nr_shown == 60) { if (time_before(jiffies, resume)) { nr_unshown++; - return; + return true; } if (nr_unshown) { pr_alert("BUG: Bad page map: %lu messages suppressed\n", @@ -529,15 +515,91 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, } if (nr_shown++ == 0) resume = jiffies + 60 * HZ; + return false; +} + +static void __print_bad_page_map_pgtable(struct mm_struct *mm, unsigned long addr) +{ + unsigned long long pgdv, p4dv, pudv, pmdv; + p4d_t p4d, *p4dp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pgd_t *pgdp; + + /* + * Although this looks like a fully lockless pgtable walk, it is not: + * see locking requirements for print_bad_page_map(). + */ + pgdp = pgd_offset(mm, addr); + pgdv = pgd_val(*pgdp); + + if (!pgd_present(*pgdp) || pgd_leaf(*pgdp)) { + pr_alert("pgd:%08llx\n", pgdv); + return; + } + + p4dp = p4d_offset(pgdp, addr); + p4d = p4dp_get(p4dp); + p4dv = p4d_val(p4d); + + if (!p4d_present(p4d) || p4d_leaf(p4d)) { + pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv); + return; + } + + pudp = pud_offset(p4dp, addr); + pud = pudp_get(pudp); + pudv = pud_val(pud); + + if (!pud_present(pud) || pud_leaf(pud)) { + pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv); + return; + } + + pmdp = pmd_offset(pudp, addr); + pmd = pmdp_get(pmdp); + pmdv = pmd_val(pmd); + + /* + * Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE, + * because the table should already be mapped by the caller and + * doing another map would be bad. print_bad_page_map() should + * already take care of printing the PTE. + */ + pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv, + p4dv, pudv, pmdv); +} + +/* + * This function is called to print an error when a bad page table entry (e.g., + * corrupted page table entry) is found. For example, we might have a + * PFN-mapped pte in a region that doesn't allow it. + * + * The calling function must still handle the error. + * + * This function must be called during a proper page table walk, as it will + * re-walk the page table to dump information: the caller MUST prevent page + * table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf + * page table lock. + */ +static void print_bad_page_map(struct vm_area_struct *vma, + unsigned long addr, unsigned long long entry, struct page *page, + enum pgtable_level level) +{ + struct address_space *mapping; + pgoff_t index; + + if (is_bad_page_map_ratelimited()) + return; mapping = vma->vm_file ? 
vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); - pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", - current->comm, - (long long)pte_val(pte), (long long)pmd_val(*pmd)); + pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm, + pgtable_level_to_str(level), entry); + __print_bad_page_map_pgtable(vma->vm_mm, addr); if (page) - dump_page(page, "bad pte"); + dump_page(page, "bad page map"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n", @@ -549,6 +611,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } +#define print_bad_pte(vma, addr, pte, page) \ + print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE) /* * vm_normal_page -- This function gets the "struct page" associated with a pte. -- cgit v1.2.3 From 2db308160b5a191b494746fd167dbbaaead3fb26 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:30 +0200 Subject: mm: introduce and use vm_normal_page_pud() Let's introduce vm_normal_page_pud(), which ends up being fairly simple because of our new common helpers and there not being a PUD-sized zero folio. Use vm_normal_page_pud() in folio_walk_start() to resolve a TODO, structuring the code like the other (pmd/pte) cases. Defer introducing vm_normal_folio_pud() until really used. Note that we can so far get PUDs with hugetlb, daxfs and PFNMAP entries. Link: https://lkml.kernel.org/r/20250811112631.759341-11-david@redhat.com Reviewed-by: Wei Yang Reviewed-by: Oscar Salvador Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: David Vrabel Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juegren Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ mm/memory.c | 19 +++++++++++++++++++ mm/pagewalk.c | 20 ++++++++++---------- 3 files changed, 31 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b626d1bacef5..8ca7d2fa7134 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2360,6 +2360,8 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); +struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t pud); void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); diff --git a/mm/memory.c b/mm/memory.c index 78af3f243cee..6f806bf3cc99 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -809,6 +809,25 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, return page_folio(page); return NULL; } + +/** + * vm_normal_page_pud() - Get the "struct page" associated with a PUD + * @vma: The VMA mapping the @pud. + * @addr: The address where the @pud is mapped. + * @pud: The PUD. + * + * Get the "struct page" associated with a PUD. 
See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ +struct page *vm_normal_page_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t pud) +{ + return __vm_normal_page(vma, addr, pud_pfn(pud), pud_special(pud), + pud_val(pud), PGTABLE_LEVEL_PUD); +} #endif /** diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 648038247a8d..c6753d370ff4 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -902,23 +902,23 @@ struct folio *folio_walk_start(struct folio_walk *fw, fw->pudp = pudp; fw->pud = pud; - /* - * TODO: FW_MIGRATION support for PUD migration entries - * once there are relevant users. - */ - if (!pud_present(pud) || pud_special(pud)) { + if (pud_none(pud)) { spin_unlock(ptl); goto not_found; - } else if (!pud_leaf(pud)) { + } else if (pud_present(pud) && !pud_leaf(pud)) { spin_unlock(ptl); goto pmd_table; + } else if (pud_present(pud)) { + page = vm_normal_page_pud(vma, addr, pud); + if (page) + goto found; } /* - * TODO: vm_normal_page_pud() will be handy once we want to - * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs. + * TODO: FW_MIGRATION support for PUD migration entries + * once there are relevant users. */ - page = pud_page(pud); - goto found; + spin_unlock(ptl); + goto not_found; } pmd_table: -- cgit v1.2.3 From 4c89792ea0a224340ff198abc7caffa211baccd6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:31 +0200 Subject: mm: rename vm_ops->find_special_page() to vm_ops->find_normal_page() ... and hide it behind a kconfig option. There is really no need for any !xen code to perform this check. The naming is a bit off: we want to find the "normal" page when a PTE was marked "special". So it's really not "finding a special" page. Improve the documentation, and add a comment in the code where XEN ends up performing the pte_mkspecial() through a hypercall. More details can be found in commit 923b2919e2c3 ("xen/gntdev: mark userspace PTEs as special on x86 PV guests"). Link: https://lkml.kernel.org/r/20250811112631.759341-12-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Lorenzo Stoakes Reviewed-by: Wei Yang Cc: David Vrabel Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juegren Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/xen/Kconfig | 1 + drivers/xen/gntdev.c | 5 +++-- include/linux/mm.h | 18 +++++++++++++----- mm/Kconfig | 2 ++ mm/memory.c | 12 ++++++++++-- tools/testing/vma/vma_internal.h | 18 +++++++++++++----- 6 files changed, 42 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 24f485827e03..f9a35ed266ec 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -138,6 +138,7 @@ config XEN_GNTDEV depends on XEN default m select MMU_NOTIFIER + select FIND_NORMAL_PAGE help Allows userspace processes to use grants. 
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 1f2160765618..26f13b37c78e 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -321,6 +321,7 @@ static int find_grant_ptes(pte_t *pte, unsigned long addr, void *data) BUG_ON(pgnr >= map->count); pte_maddr = arbitrary_virt_to_machine(pte).maddr; + /* Note: this will perform a pte_mkspecial() through the hypercall. */ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, flags, map->grants[pgnr].ref, map->grants[pgnr].domid); @@ -528,7 +529,7 @@ static void gntdev_vma_close(struct vm_area_struct *vma) gntdev_put_map(priv, map); } -static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma, +static struct page *gntdev_vma_find_normal_page(struct vm_area_struct *vma, unsigned long addr) { struct gntdev_grant_map *map = vma->vm_private_data; @@ -539,7 +540,7 @@ static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma, static const struct vm_operations_struct gntdev_vmops = { .open = gntdev_vma_open, .close = gntdev_vma_close, - .find_special_page = gntdev_vma_find_special_page, + .find_normal_page = gntdev_vma_find_normal_page, }; /* ------------------------------------------------------------------ */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 8ca7d2fa7134..3868ca1a25f9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -657,13 +657,21 @@ struct vm_operations_struct { struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); #endif +#ifdef CONFIG_FIND_NORMAL_PAGE /* - * Called by vm_normal_page() for special PTEs to find the - * page for @addr. This is useful if the default behavior - * (using pte_page()) would not find the correct page. + * Called by vm_normal_page() for special PTEs in @vma at @addr. This + * allows for returning a "normal" page from vm_normal_page() even + * though the PTE indicates that the "struct page" either does not exist + * or should not be touched: "special". + * + * Do not add new users: this really only works when a "normal" page + * was mapped, but then the PTE got changed to something weird (+ + * marked special) that would not make pte_pfn() identify the originally + * inserted page. */ - struct page *(*find_special_page)(struct vm_area_struct *vma, - unsigned long addr); + struct page *(*find_normal_page)(struct vm_area_struct *vma, + unsigned long addr); +#endif /* CONFIG_FIND_NORMAL_PAGE */ }; #ifdef CONFIG_NUMA_BALANCING diff --git a/mm/Kconfig b/mm/Kconfig index e443fe8cd6cf..59a04d0b2e27 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1381,6 +1381,8 @@ config PT_RECLAIM Note: now only empty user PTE page table pages will be reclaimed. +config FIND_NORMAL_PAGE + def_bool n source "mm/damon/Kconfig" diff --git a/mm/memory.c b/mm/memory.c index 6f806bf3cc99..002c28795d8b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -639,6 +639,12 @@ static void print_bad_page_map(struct vm_area_struct *vma, * trivial. Secondly, an architecture may not have a spare page table * entry bit, which requires a more complicated scheme, described below. * + * With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on + * page table entries that actually map "normal" pages: however, that page + * cannot be looked up through the PFN stored in the page table entry, but + * instead will be looked up through vm_ops->find_normal_page(). So far, this + * only applies to PTEs. + * * A raw VM_PFNMAP mapping (ie. 
one that is not COWed) is always considered a special mapping (even if there are underlying and valid "struct pages"). * COWed pages of a VM_PFNMAP are always normal. @@ -679,8 +685,10 @@ static inline struct page *__vm_normal_page(struct vm_area_struct *vma, { if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { if (unlikely(special)) { - if (vma->vm_ops && vma->vm_ops->find_special_page) - return vma->vm_ops->find_special_page(vma, addr); +#ifdef CONFIG_FIND_NORMAL_PAGE + if (vma->vm_ops && vma->vm_ops->find_normal_page) + return vma->vm_ops->find_normal_page(vma, addr); +#endif /* CONFIG_FIND_NORMAL_PAGE */ if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)) diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 3639aa8dd2b0..cb1c2a8afe26 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -467,13 +467,21 @@ struct vm_operations_struct { struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); #endif +#ifdef CONFIG_FIND_NORMAL_PAGE /* - * Called by vm_normal_page() for special PTEs to find the - * page for @addr. This is useful if the default behavior - * (using pte_page()) would not find the correct page. + * Called by vm_normal_page() for special PTEs in @vma at @addr. This + * allows for returning a "normal" page from vm_normal_page() even + * though the PTE indicates that the "struct page" either does not exist + * or should not be touched: "special". + * + * Do not add new users: this really only works when a "normal" page + * was mapped, but then the PTE got changed to something weird (+ + * marked special) that would not make pte_pfn() identify the originally + * inserted page. */ - struct page *(*find_special_page)(struct vm_area_struct *vma, - unsigned long addr); + struct page *(*find_normal_page)(struct vm_area_struct *vma, + unsigned long addr); +#endif /* CONFIG_FIND_NORMAL_PAGE */ }; struct vm_unmapped_area_info { -- cgit v1.2.3 From 2843408ca971d1472a6a8b32ee4647f55ecab598 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 11 Aug 2025 10:41:10 +0200 Subject: mm: rename MMF_HUGE_ZERO_PAGE to MMF_HUGE_ZERO_FOLIO As all the helper functions have been renamed from *_page to *_folio, rename the MM flag from MMF_HUGE_ZERO_PAGE to MMF_HUGE_ZERO_FOLIO. No functional changes. Link: https://lkml.kernel.org/r/20250811084113.647267-3-kernel@pankajraghav.com Signed-off-by: Pankaj Raghav Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Reviewed-by: Hannes Reinecke Cc: Baolin Wang Cc: Christoph Hellwig Cc: "Darrick J.
Wong" Cc: Dev Jain Cc: Jens Axboe Cc: Liam Howlett Cc: Luis Chamberalin Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: "Ritesh Harjani (IBM)" Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Vlastimil Babka Cc: Kiryl Shutsemau Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 2 +- mm/huge_memory.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3ed763e7ec6f..cf94df4955c7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1758,7 +1758,7 @@ enum { #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ -#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ +#define MMF_HUGE_ZERO_FOLIO 23 /* mm has ever used the global huge zero folio */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3f0c8c2856d3..2801ce9bbde9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -248,13 +248,13 @@ static void put_huge_zero_folio(void) struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) { - if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) return READ_ONCE(huge_zero_folio); if (!get_huge_zero_folio()) return NULL; - if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + if (test_and_set_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) put_huge_zero_folio(); return READ_ONCE(huge_zero_folio); @@ -262,7 +262,7 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) void mm_put_huge_zero_folio(struct mm_struct *mm) { - if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) put_huge_zero_folio(); } -- cgit v1.2.3 From 2d8bd8049e89efe42a5397de4effd899e8dd2249 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 11 Aug 2025 10:41:11 +0200 Subject: mm: add persistent huge zero folio Many places in the kernel need to zero out larger chunks, but the maximum segment that can be zeroed out at a time by ZERO_PAGE is limited by PAGE_SIZE. This is especially annoying in block devices and filesystems where multiple ZERO_PAGEs are attached to the bio in different bvecs. With multipage bvec support in block layer, it is much more efficient to send out larger zero pages as a part of single bvec. This concern was raised during the review of adding Large Block Size support to XFS[1][2]. Usually huge_zero_folio is allocated on demand, and it will be deallocated by the shrinker if there are no users of it left. At moment, huge_zero_folio infrastructure refcount is tied to the process lifetime that created it. This might not work for bio layer as the completions can be async and the process that created the huge_zero_folio might no longer be alive. And, one of the main points that came up during discussion is to have something bigger than zero page as a drop-in replacement. Add a config option PERSISTENT_HUGE_ZERO_FOLIO that will result in allocating the huge zero folio during early init and never free the memory by disabling the shrinker. 
This config option makes it possible to use the huge_zero_folio without having to pass any mm struct, and does not tie the lifetime of the zero folio to anything, making it a drop-in replacement for ZERO_PAGE. If the PERSISTENT_HUGE_ZERO_FOLIO config option is enabled, then mm_get_huge_zero_folio() will simply return the allocated page instead of dynamically allocating a new PMD page. Use this option carefully in resource-constrained systems, as it uses one full PMD-sized page for zeroing purposes. [1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@lst.de/ [2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@infradead.org/ Link: https://lkml.kernel.org/r/20250811084113.647267-4-kernel@pankajraghav.com Signed-off-by: David Hildenbrand Signed-off-by: Pankaj Raghav Reviewed-by: Lorenzo Stoakes Co-developed-by: David Hildenbrand Reviewed-by: Hannes Reinecke Cc: Baolin Wang Cc: Christoph Hellwig Cc: "Darrick J. Wong" Cc: Dev Jain Cc: Jens Axboe Cc: Liam Howlett Cc: Luis Chamberalin Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: "Ritesh Harjani (IBM)" Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Vlastimil Babka Cc: Zi Yan Cc: Kiryl Shutsemau Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 16 ++++++++++++++++ mm/Kconfig | 16 ++++++++++++++++ mm/huge_memory.c | 40 ++++++++++++++++++++++++++++++---------- 3 files changed, 62 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7748489fde1b..bd547857c6c1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -495,6 +495,17 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) struct folio *mm_get_huge_zero_folio(struct mm_struct *mm); void mm_put_huge_zero_folio(struct mm_struct *mm); +static inline struct folio *get_persistent_huge_zero_folio(void) +{ + if (!IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) + return NULL; + + if (unlikely(!huge_zero_folio)) + return NULL; + + return huge_zero_folio; +} + static inline bool thp_migration_supported(void) { return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); @@ -685,6 +696,11 @@ static inline int change_huge_pud(struct mmu_gather *tlb, { return 0; } + +static inline struct folio *get_persistent_huge_zero_folio(void) +{ + return NULL; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, diff --git a/mm/Kconfig b/mm/Kconfig index 59a04d0b2e27..4108bcd96784 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -823,6 +823,22 @@ config ARCH_WANT_GENERAL_HUGETLB config ARCH_WANTS_THP_SWAP def_bool n +config PERSISTENT_HUGE_ZERO_FOLIO + bool "Allocate a PMD sized folio for zeroing" + depends on TRANSPARENT_HUGEPAGE + help + Enable this option to reduce the runtime refcounting overhead + of the huge zero folio and expand the places in the kernel + that can use huge zero folios. For instance, block I/O benefits + from access to large folios for zeroing memory. + + With this option enabled, the huge zero folio is allocated + once and never freed. One full huge page's worth of memory shall + be used. + + Say Y if your system has lots of memory. Say N if you are + memory constrained.
+ config MM_ID def_bool n diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2801ce9bbde9..b8bb078a1a34 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -248,6 +248,9 @@ static void put_huge_zero_folio(void) struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) { + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) + return huge_zero_folio; + if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) return READ_ONCE(huge_zero_folio); @@ -262,6 +265,9 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) void mm_put_huge_zero_folio(struct mm_struct *mm) { + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) + return; + if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) put_huge_zero_folio(); } @@ -849,16 +855,34 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) static int __init thp_shrinker_init(void) { - huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero"); - if (!huge_zero_folio_shrinker) - return -ENOMEM; - deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | SHRINKER_NONSLAB, "thp-deferred_split"); - if (!deferred_split_shrinker) { - shrinker_free(huge_zero_folio_shrinker); + if (!deferred_split_shrinker) + return -ENOMEM; + + deferred_split_shrinker->count_objects = deferred_split_count; + deferred_split_shrinker->scan_objects = deferred_split_scan; + shrinker_register(deferred_split_shrinker); + + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) { + /* + * Bump the reference of the huge_zero_folio and do not + * initialize the shrinker. + * + * huge_zero_folio will always be NULL on failure. We assume + * that get_huge_zero_folio() will most likely not fail as + * thp_shrinker_init() is invoked early on during boot. + */ + if (!get_huge_zero_folio()) + pr_warn("Allocating persistent huge zero folio failed\n"); + return 0; + } + + huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero"); + if (!huge_zero_folio_shrinker) { + shrinker_free(deferred_split_shrinker); return -ENOMEM; } @@ -866,10 +890,6 @@ static int __init thp_shrinker_init(void) huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan; shrinker_register(huge_zero_folio_shrinker); - deferred_split_shrinker->count_objects = deferred_split_count; - deferred_split_shrinker->scan_objects = deferred_split_scan; - shrinker_register(deferred_split_shrinker); - return 0; } -- cgit v1.2.3 From 415a0fd62f1899fe2bb81d661e427194b1c97201 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 11 Aug 2025 10:41:12 +0200 Subject: mm: add largest_zero_folio() routine The callers of mm_get_huge_zero_folio() have access to a mm struct and the lifetime of the huge_zero_folio is tied to the lifetime of the mm struct. largest_zero_folio() will give access to huge_zero_folio when PERSISTENT_HUGE_ZERO_FOLIO config option is enabled for callers that do not want to tie the lifetime to a mm struct. This is very useful for filesystem and block layers where the request completions can be async and there is no guarantee on the mm struct lifetime. This function will return a ZERO_PAGE folio if PERSISTENT_HUGE_ZERO_FOLIO is disabled or if we failed to allocate a huge_zero_folio during early init. Link: https://lkml.kernel.org/r/20250811084113.647267-5-kernel@pankajraghav.com Signed-off-by: David Hildenbrand Signed-off-by: Pankaj Raghav Reviewed-by: Lorenzo Stoakes Co-developed-by: David Hildenbrand Reviewed-by: Hannes Reinecke Cc: Baolin Wang Cc: Christoph Hellwig Cc: "Darrick J. 
Wong" Cc: Dev Jain Cc: Jens Axboe Cc: Liam Howlett Cc: Luis Chamberalin Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: "Ritesh Harjani (IBM)" Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Vlastimil Babka Cc: Zi Yan Cc: Kiryl Shutsemau Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index bd547857c6c1..14d424830fa8 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -714,4 +714,26 @@ static inline int split_folio_to_order(struct folio *folio, int new_order) return split_folio_to_list_to_order(folio, NULL, new_order); } +/** + * largest_zero_folio - Get the largest zero size folio available + * + * This function shall be used when mm_get_huge_zero_folio() cannot be + * used as there is no appropriate mm lifetime to tie the huge zero folio + * from the caller. + * + * Deduce the size of the folio with folio_size instead of assuming the + * folio size. + * + * Return: pointer to PMD sized zero folio if CONFIG_PERSISTENT_HUGE_ZERO_FOLIO + * is enabled or a single page sized zero folio + */ +static inline struct folio *largest_zero_folio(void) +{ + struct folio *folio = get_persistent_huge_zero_folio(); + + if (folio) + return folio; + + return page_folio(ZERO_PAGE(0)); +} #endif /* _LINUX_HUGE_MM_H */ -- cgit v1.2.3 From bb6525f2f8c41e89ba3fc506bc1705c68cf845ae Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:10 +0100 Subject: mm: add bitmap mm->flags field Patch series "mm: make mm->flags a bitmap and 64-bit on all arches". We are currently in the bizarre situation where we are constrained on the number of flags we can set in an mm_struct based on whether this is a 32-bit or 64-bit kernel. This is because mm->flags is an unsigned long field, which is 32-bits on a 32-bit system and 64-bits on a 64-bit system. In order to keep things functional across both architectures, we do not permit mm flag bits to be set above flag 31 (i.e. the 32nd bit). This is a silly situation, especially given how profligate we are in storing metadata in mm_struct, so let's convert mm->flags into a bitmap and allow ourselves as many bits as we like. In order to execute this change, we introduce a new opaque type - mm_flags_t - which wraps a bitmap. We go further and mark the bitmap field __private, which forces users to have to use accessors, which allows us to enforce atomicity rules around mm->flags (except on those occasions they are not required - fork, etc.) and makes it far easier to keep track of how mm flags are being utilised. In order to implement this change sensibly and an an iterative way, we start by introducing the type with the same bitsize as the current mm flags (system word size) and place it in union with mm->flags. We are then able to gradually update users as we go without being forced to do everything in a single patch. In the course of working on this series I noticed the MMF_* flag masks encounter a sign extension bug that, due to the 32-bit limit on mm->flags thus far, has not caused any issues in practice, but required fixing for this series. We must make special dispensation for two cases - coredump and initailisation on fork, but of which use masks extensively. Since coredump flags are set in stone, we can safely assume they will remain in the first 32-bits of the flags. 
For the coredump case we therefore provide special non-atomic accessors that access the first system word of flags, keeping everything there essentially the same. For mm->flags initialisation on fork, we adjust the logic to ensure all bits are cleared correctly, and then adjust the existing initialisation logic, dubbing the implementation utilising flags as legacy. This means we get the same fast operations as we do now, but in future we can also choose to update the forking logic to additionally propagate flags beyond 32-bits across fork. With this change in place we can, in future, decide to have as many bits as we please. Since the size of the bitmap will scale in system word multiples, there should be no issues with changes in alignment in mm_struct. Additionally, the really sensitive field (mmap_lock) is located prior to the flags field so this should have no impact on that either. This patch (of 10): We are currently in the bizarre situation where we are constrained on the number of flags we can set in an mm_struct based on whether this is a 32-bit or 64-bit kernel. This is because mm->flags is an unsigned long field, which is 32-bits on a 32-bit system and 64-bits on a 64-bit system. In order to keep things functional across both architectures, we do not permit mm flag bits to be set above flag 31 (i.e. the 32nd bit). This is a silly situation, especially given how profligate we are in storing metadata in mm_struct, so let's convert mm->flags into a bitmap and allow ourselves as many bits as we like. To keep things manageable, firstly we introduce the bitmap at a system word size as a new field mm->_flags, in union. This means the new bitmap mm->_flags is bitwise exactly identical to the existing mm->flags field. We have an opportunity to also introduce some type safety here, so let's wrap the mm flags field as a struct and declare it as an mm_flags_t typedef to keep it consistent with vm_flags_t for VMAs. We make the internal field private, in order to force the use of helper functions so we can enforce that accesses are bitwise as required. We therefore introduce accessors prefixed with mm_flags_*() for callers to use. We place the bit parameter first so as to match the parameter ordering of the *_bit() functions. Having this temporary union arrangement allows us to incrementally swap over users of mm->flags patch-by-patch rather than having to do everything in one fell swoop. [lorenzo.stoakes@oracle.com: place __private in correct place, const-ify __mm_flags_get_word] Link: https://lkml.kernel.org/r/d4ba117d-6234-4069-b871-254d152d7d21@lucifer.local Link: https://lkml.kernel.org/r/cover.1755012943.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/9de8dfd9de8c95cd31622d6e52051ba0d1848f5a.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Petkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H.
Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3868ca1a25f9..4ed4a0b9dad6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -34,6 +34,8 @@ #include #include #include +#include +#include struct mempolicy; struct anon_vma; @@ -720,6 +722,36 @@ static inline void assert_fault_locked(struct vm_fault *vmf) } #endif /* CONFIG_PER_VMA_LOCK */ +static inline bool mm_flags_test(int flag, const struct mm_struct *mm) +{ + return test_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm) +{ + return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm) +{ + return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline void mm_flags_set(int flag, struct mm_struct *mm) +{ + set_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline void mm_flags_clear(int flag, struct mm_struct *mm) +{ + clear_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline void mm_flags_clear_all(struct mm_struct *mm) +{ + bitmap_zero(ACCESS_PRIVATE(&mm->_flags, __mm_flags), NUM_MM_FLAG_BITS); +} + extern const struct vm_operations_struct vma_dummy_vm_ops; static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cf94df4955c7..0e001dbad455 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -927,6 +928,15 @@ struct mm_cid { }; #endif +/* + * Opaque type representing current mm_struct flag state. Must be accessed via + * mm_flags_xxx() helper functions. + */ +#define NUM_MM_FLAG_BITS BITS_PER_LONG +typedef struct { + DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); +} __private mm_flags_t; + struct kioctx_table; struct iommu_mm_data; struct mm_struct { @@ -1109,7 +1119,11 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; - unsigned long flags; /* Must use atomic bitops to access */ + /* Temporary union while we convert users to mm_flags_t. */ + union { + unsigned long flags; /* Must use atomic bitops to access */ + mm_flags_t _flags; /* Must use mm_flags_* helpers to access */ + }; #ifdef CONFIG_AIO spinlock_t ioctx_lock; @@ -1219,6 +1233,28 @@ struct mm_struct { unsigned long cpu_bitmap[]; }; +/* Set the first system word of mm flags, non-atomically. 
*/ +static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + + bitmap_copy(bitmap, &value, BITS_PER_LONG); +} + +/* Obtain a read-only view of the bitmap. */ +static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm) +{ + return (const unsigned long *)ACCESS_PRIVATE(&mm->_flags, __mm_flags); +} + +/* Read the first system word of mm flags, non-atomically. */ +static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm) +{ + const unsigned long *bitmap = __mm_flags_get_bitmap(mm); + + return bitmap_read(bitmap, 0, BITS_PER_LONG); +} + #define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \ MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; -- cgit v1.2.3 From 12e423ba4eaed7b1561b677d32e6599f932d03db Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:11 +0100 Subject: mm: convert core mm to mm_flags_*() accessors As part of the effort to move to mm->flags becoming a bitmap field, convert existing users to making use of the mm_flags_*() accessors which will, when the conversion is complete, be the only means of accessing mm_struct flags. This will result in the debug output being that of a bitmap output, which will result in a minor change here, but since this is for debug only, this should have no bearing. Otherwise, no functional changes intended. [akpm@linux-foundation.org: fix typo in comment]Link: https://lkml.kernel.org/r/1eb2266f4408798a55bda00cb04545a3203aa572.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 +- include/linux/khugepaged.h | 6 ++++-- include/linux/ksm.h | 6 +++--- include/linux/mm.h | 2 +- include/linux/mman.h | 2 +- include/linux/oom.h | 2 +- mm/debug.c | 4 ++-- mm/gup.c | 10 +++++----- mm/huge_memory.c | 8 ++++---- mm/khugepaged.c | 10 +++++----- mm/ksm.c | 32 ++++++++++++++++---------------- mm/mmap.c | 8 ++++---- mm/oom_kill.c | 26 +++++++++++++------------- mm/util.c | 6 +++--- 14 files changed, 63 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 14d424830fa8..84b7eebe0d68 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -327,7 +327,7 @@ static inline bool vma_thp_disabled(struct vm_area_struct *vma, * example, s390 kvm. 
*/ return (vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags); + mm_flags_test(MMF_DISABLE_THP, vma->vm_mm); } static inline bool thp_disabled_by_hw(void) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index ff6120463745..eb1946a70cff 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -2,6 +2,8 @@ #ifndef _LINUX_KHUGEPAGED_H #define _LINUX_KHUGEPAGED_H +#include <linux/mm.h> + extern unsigned int khugepaged_max_ptes_none __read_mostly; #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern struct attribute_group khugepaged_attr_group; @@ -20,13 +22,13 @@ extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { - if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags)) + if (mm_flags_test(MMF_VM_HUGEPAGE, oldmm)) __khugepaged_enter(mm); } static inline void khugepaged_exit(struct mm_struct *mm) { - if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) + if (mm_flags_test(MMF_VM_HUGEPAGE, mm)) __khugepaged_exit(mm); } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/include/linux/ksm.h b/include/linux/ksm.h index c17b955e7b0b..22e67ca7cba3 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -56,13 +56,13 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm) static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { /* Adding mm to ksm is best effort on fork. */ - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) + if (mm_flags_test(MMF_VM_MERGEABLE, oldmm)) __ksm_enter(mm); } static inline int ksm_execve(struct mm_struct *mm) { - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return __ksm_enter(mm); return 0; @@ -70,7 +70,7 @@ static inline int ksm_execve(struct mm_struct *mm) static inline void ksm_exit(struct mm_struct *mm) { - if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGEABLE, mm)) __ksm_exit(mm); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 4ed4a0b9dad6..34311ebe62cc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1949,7 +1949,7 @@ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, { VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); - if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) + if (!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm)) return false; return folio_maybe_dma_pinned(folio); diff --git a/include/linux/mman.h b/include/linux/mman.h index de9e8e6229a4..0ba8a7e8b90a 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -201,7 +201,7 @@ static inline bool arch_memory_deny_write_exec_supported(void) static inline bool map_deny_write_exec(unsigned long old, unsigned long new) { /* If MDWE is disabled, we have nothing to deny. */ - if (!test_bit(MMF_HAS_MDWE, &current->mm->flags)) + if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) return false; /* If the new VMA is not executable, we have nothing to deny.
*/ diff --git a/include/linux/oom.h b/include/linux/oom.h index 1e0fc6931ce9..7b02bc1d0a7e 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -91,7 +91,7 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) */ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) { - if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags))) + if (unlikely(mm_flags_test(MMF_UNSTABLE, mm))) return VM_FAULT_SIGBUS; return 0; } diff --git a/mm/debug.c b/mm/debug.c index b4388f4dcd4d..64ddb0c4b4be 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -182,7 +182,7 @@ void dump_mm(const struct mm_struct *mm) "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" - "binfmt %px flags %lx\n" + "binfmt %px flags %*pb\n" #ifdef CONFIG_AIO "ioctx_table %px\n" #endif @@ -211,7 +211,7 @@ void dump_mm(const struct mm_struct *mm) mm->start_code, mm->end_code, mm->start_data, mm->end_data, mm->start_brk, mm->brk, mm->start_stack, mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, - mm->binfmt, mm->flags, + mm->binfmt, NUM_MM_FLAG_BITS, __mm_flags_get_bitmap(mm), #ifdef CONFIG_AIO mm->ioctx_table, #endif diff --git a/mm/gup.c b/mm/gup.c index adffe663594d..331d22bf7b2d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -475,10 +475,10 @@ EXPORT_SYMBOL_GPL(unpin_folios); * lifecycle. Avoid setting the bit unless necessary, or it might cause write * cache bouncing on large SMP machines for concurrent pinned gups. */ -static inline void mm_set_has_pinned_flag(unsigned long *mm_flags) +static inline void mm_set_has_pinned_flag(struct mm_struct *mm) { - if (!test_bit(MMF_HAS_PINNED, mm_flags)) - set_bit(MMF_HAS_PINNED, mm_flags); + if (!mm_flags_test(MMF_HAS_PINNED, mm)) + mm_flags_set(MMF_HAS_PINNED, mm); } #ifdef CONFIG_MMU @@ -1693,7 +1693,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, mmap_assert_locked(mm); if (flags & FOLL_PIN) - mm_set_has_pinned_flag(&mm->flags); + mm_set_has_pinned_flag(mm); /* * FOLL_PIN and FOLL_GET are mutually exclusive. 
Traditional behavior @@ -3210,7 +3210,7 @@ static int gup_fast_fallback(unsigned long start, unsigned long nr_pages, return -EINVAL; if (gup_flags & FOLL_PIN) - mm_set_has_pinned_flag(&current->mm->flags); + mm_set_has_pinned_flag(current->mm); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(&current->mm->mmap_lock); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b8bb078a1a34..a2f476e7419a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -251,13 +251,13 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) return huge_zero_folio; - if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) + if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm)) return READ_ONCE(huge_zero_folio); if (!get_huge_zero_folio()) return NULL; - if (test_and_set_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) + if (mm_flags_test_and_set(MMF_HUGE_ZERO_FOLIO, mm)) put_huge_zero_folio(); return READ_ONCE(huge_zero_folio); @@ -268,7 +268,7 @@ void mm_put_huge_zero_folio(struct mm_struct *mm) if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) return; - if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) + if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm)) put_huge_zero_folio(); } @@ -1145,7 +1145,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, off_sub = (off - ret) & (size - 1); - if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub) + if (mm_flags_test(MMF_TOPDOWN, current->mm) && !off_sub) return ret + size; ret += off_sub; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6b40bdfd224c..550eb00116c5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -410,7 +410,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) { return hpage_collapse_test_exit(mm) || - test_bit(MMF_DISABLE_THP, &mm->flags); + mm_flags_test(MMF_DISABLE_THP, mm); } static bool hugepage_pmd_enabled(void) @@ -445,7 +445,7 @@ void __khugepaged_enter(struct mm_struct *mm) /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); - if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) + if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) return; mm_slot = mm_slot_alloc(mm_slot_cache); @@ -472,7 +472,7 @@ void __khugepaged_enter(struct mm_struct *mm) void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags) { - if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && + if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_pmd_enabled()) { if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, PMD_ORDER)) @@ -497,7 +497,7 @@ void __khugepaged_exit(struct mm_struct *mm) spin_unlock(&khugepaged_mm_lock); if (free) { - clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + mm_flags_clear(MMF_VM_HUGEPAGE, mm); mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } else if (mm_slot) { @@ -1459,7 +1459,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) /* * Not strictly needed because the mm exited already.
* - * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + * mm_flags_clear(MMF_VM_HUGEPAGE, mm); */ /* khugepaged_mm_lock actually not necessary for the below */ diff --git a/mm/ksm.c b/mm/ksm.c index 160787bb121c..2ef29802a49b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1217,8 +1217,8 @@ mm_exiting: spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGEABLE, mm); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmdrop(mm); } else spin_unlock(&ksm_mmlist_lock); @@ -2620,8 +2620,8 @@ no_vmas: spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGEABLE, mm); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmap_read_unlock(mm); mmdrop(mm); } else { @@ -2742,7 +2742,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma) vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) && + if (mm_flags_test(MMF_VM_MERGE_ANY, mm) && __ksm_should_add_vma(file, vm_flags)) vm_flags |= VM_MERGEABLE; @@ -2784,16 +2784,16 @@ int ksm_enable_merge_any(struct mm_struct *mm) { int err; - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return 0; - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) { err = __ksm_enter(mm); if (err) return err; } - set_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_set(MMF_VM_MERGE_ANY, mm); ksm_add_vmas(mm); return 0; @@ -2815,7 +2815,7 @@ int ksm_disable_merge_any(struct mm_struct *mm) { int err; - if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (!mm_flags_test(MMF_VM_MERGE_ANY, mm)) return 0; err = ksm_del_vmas(mm); @@ -2824,7 +2824,7 @@ int ksm_disable_merge_any(struct mm_struct *mm) return err; } - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); return 0; } @@ -2832,9 +2832,9 @@ int ksm_disable(struct mm_struct *mm) { mmap_assert_write_locked(mm); - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) return 0; - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return ksm_disable_merge_any(mm); return ksm_del_vmas(mm); } @@ -2852,7 +2852,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, if (!vma_ksm_compatible(vma)) return 0; - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) { err = __ksm_enter(mm); if (err) return err; @@ -2912,7 +2912,7 @@ int __ksm_enter(struct mm_struct *mm) list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); - set_bit(MMF_VM_MERGEABLE, &mm->flags); + mm_flags_set(MMF_VM_MERGEABLE, mm); mmgrab(mm); if (needs_wakeup) @@ -2954,8 +2954,8 @@ void __ksm_exit(struct mm_struct *mm) if (easy_to_free) { mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); + mm_flags_clear(MMF_VM_MERGEABLE, mm); mmdrop(mm); } else if (mm_slot) { mmap_write_lock(mm); diff --git a/mm/mmap.c b/mm/mmap.c index 7306253cc3b5..7a057e0e8da9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -802,7 +802,7 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { - if (test_bit(MMF_TOPDOWN, 
&mm->flags)) + if (mm_flags_test(MMF_TOPDOWN, mm)) return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); @@ -1284,7 +1284,7 @@ void exit_mmap(struct mm_struct *mm) * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper * because the memory has been already freed. */ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); mmap_write_lock(mm); mt_clear_in_rcu(&mm->mm_mt); vma_iter_set(&vmi, vma->vm_end); @@ -1859,14 +1859,14 @@ loop_out: mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); mas_store(&vmi.mas, XA_ZERO_ENTRY); /* Avoid OOM iterating a broken tree */ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); } /* * The mm_struct is going to exit, but the locks will be dropped * first. Set the mm_struct as unstable is advisable as it is * not fully initialised. */ - set_bit(MMF_UNSTABLE, &mm->flags); + mm_flags_set(MMF_UNSTABLE, mm); } out: mmap_write_unlock(mm); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 25923cfec9c6..17650f0b516e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/oom_kill.c - * + * * Copyright (C) 1998,2000 Rik van Riel * Thanks go out to Claus Fischer for some serious inspiration and * for goading me into coding this file... @@ -218,7 +218,7 @@ long oom_badness(struct task_struct *p, unsigned long totalpages) */ adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN || - test_bit(MMF_OOM_SKIP, &p->mm->flags) || + mm_flags_test(MMF_OOM_SKIP, p->mm) || in_vfork(p)) { task_unlock(p); return LONG_MIN; @@ -325,7 +325,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) * any memory is quite low. */ if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) { - if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) + if (mm_flags_test(MMF_OOM_SKIP, task->signal->oom_mm)) goto next; goto abort; } @@ -524,7 +524,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) * should imply barriers already and the reader would hit a page fault * if it stumbled over a reaped memory. */ - set_bit(MMF_UNSTABLE, &mm->flags); + mm_flags_set(MMF_UNSTABLE, mm); for_each_vma(vmi, vma) { if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) @@ -583,7 +583,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * under mmap_lock for reading because it serializes against the * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap(). */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + if (mm_flags_test(MMF_OOM_SKIP, mm)) { trace_skip_task_reaping(tsk->pid); goto out_unlock; } @@ -619,7 +619,7 @@ static void oom_reap_task(struct task_struct *tsk) schedule_timeout_idle(HZ/10); if (attempts <= MAX_OOM_REAP_RETRIES || - test_bit(MMF_OOM_SKIP, &mm->flags)) + mm_flags_test(MMF_OOM_SKIP, mm)) goto done; pr_info("oom_reaper: unable to reap pid:%d (%s)\n", @@ -634,7 +634,7 @@ done: * Hide this mm from OOM killer because it has been either reaped or * somebody can't call mmap_write_unlock(mm). 
*/ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); /* Drop a reference taken by queue_oom_reaper */ put_task_struct(tsk); @@ -670,7 +670,7 @@ static void wake_oom_reaper(struct timer_list *timer) unsigned long flags; /* The victim managed to terminate on its own - see exit_mmap */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + if (mm_flags_test(MMF_OOM_SKIP, mm)) { put_task_struct(tsk); return; } @@ -695,7 +695,7 @@ static void wake_oom_reaper(struct timer_list *timer) static void queue_oom_reaper(struct task_struct *tsk) { /* mm is already queued? */ - if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) + if (mm_flags_test_and_set(MMF_OOM_REAP_QUEUED, tsk->signal->oom_mm)) return; get_task_struct(tsk); @@ -892,7 +892,7 @@ static bool task_will_free_mem(struct task_struct *task) * This task has already been drained by the oom reaper so there are * only small chances it will free some more */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) + if (mm_flags_test(MMF_OOM_SKIP, mm)) return false; if (atomic_read(&mm->mm_users) <= 1) @@ -977,7 +977,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) continue; if (is_global_init(p)) { can_oom_reap = false; - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", task_pid_nr(victim), victim->comm, task_pid_nr(p), p->comm); @@ -1235,7 +1235,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) reap = true; else { /* Error only if the work has not been done already */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags)) + if (!mm_flags_test(MMF_OOM_SKIP, mm)) ret = -EINVAL; } task_unlock(p); @@ -1251,7 +1251,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure * possible change in exit_mmap is seen */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm)) + if (!mm_flags_test(MMF_OOM_SKIP, mm) && !__oom_reap_task_mm(mm)) ret = -EAGAIN; mmap_read_unlock(mm); diff --git a/mm/util.c b/mm/util.c index f814e6a59ab1..d235b74f7aff 100644 --- a/mm/util.c +++ b/mm/util.c @@ -471,17 +471,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - set_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_set(MMF_TOPDOWN, mm); } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); } #endif #ifdef CONFIG_MMU -- cgit v1.2.3 From 39f8049cd49f7e88f89a33f97f996c7306e8be0b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:15 +0100 Subject: mm: update coredump logic to correctly use bitmap mm flags The coredump logic is slightly different from other users in that it both stores mm flags and additionally sets and gets using masks. Since the MMF_DUMPABLE_* flags must remain as they are for uABI reasons, and of course these are within the first 32-bits of the flags, it is reasonable to provide access to these in the same fashion so this logic can all still keep working as it has been.
Therefore, introduce coredump-specific helpers __mm_flags_get_dumpable() and __mm_flags_set_mask_dumpable() for this purpose, and update all core dump users of mm flags to use these. [lorenzo.stoakes@oracle.com: abstract set_mask_bits() invocation to mm_types.h to satisfy ARC] Link: https://lkml.kernel.org/r/0e7ad263-1ff7-446d-81fe-97cff9c0e7ed@lucifer.local Link: https://lkml.kernel.org/r/2a5075f7e3c5b367d988178c79a3063d12ee53a9.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Christian Brauner Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/coredump.c | 4 +++- fs/exec.c | 2 +- fs/pidfs.c | 7 +++++-- fs/proc/base.c | 8 +++++--- include/linux/mm_types.h | 12 ++++++++++++ include/linux/sched/coredump.h | 18 +++++++++++++++++- 6 files changed, 43 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/coredump.c b/fs/coredump.c index 5dce257c67fc..f9d82ffc4b88 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1103,8 +1103,10 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) * We must use the same mm->flags while dumping core to avoid * inconsistency of bit flags, since this flag is not protected * by any locks. + * + * Note that we only care about MMF_DUMP* flags. 
*/ - .mm_flags = mm->flags, + .mm_flags = __mm_flags_get_dumpable(mm), .vma_meta = NULL, .cpu = raw_smp_processor_id(), }; diff --git a/fs/exec.c b/fs/exec.c index 2a1e5e4042a1..dbac0e84cc3e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1999,7 +1999,7 @@ void set_dumpable(struct mm_struct *mm, int value) if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) return; - set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value); + __mm_flags_set_mask_dumpable(mm, value); } SYSCALL_DEFINE3(execve, diff --git a/fs/pidfs.c b/fs/pidfs.c index 108e7527f837..9913c5268fef 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -357,8 +357,11 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) if ((kinfo.mask & PIDFD_INFO_COREDUMP) && !(kinfo.coredump_mask)) { task_lock(task); - if (task->mm) - kinfo.coredump_mask = pidfs_coredump_mask(task->mm->flags); + if (task->mm) { + unsigned long flags = __mm_flags_get_dumpable(task->mm); + + kinfo.coredump_mask = pidfs_coredump_mask(flags); + } task_unlock(task); } diff --git a/fs/proc/base.c b/fs/proc/base.c index 62d35631ba8c..f0c093c58aaf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2962,8 +2962,10 @@ static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, ret = 0; mm = get_task_mm(task); if (mm) { + unsigned long flags = __mm_flags_get_dumpable(mm); + len = snprintf(buffer, sizeof(buffer), "%08lx\n", - ((mm->flags & MMF_DUMP_FILTER_MASK) >> + ((flags & MMF_DUMP_FILTER_MASK) >> MMF_DUMP_FILTER_SHIFT)); mmput(mm); ret = simple_read_from_buffer(buf, count, ppos, buffer, len); @@ -3002,9 +3004,9 @@ static ssize_t proc_coredump_filter_write(struct file *file, for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { if (val & mask) - set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + mm_flags_set(i + MMF_DUMP_FILTER_SHIFT, mm); else - clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + mm_flags_clear(i + MMF_DUMP_FILTER_SHIFT, mm); } mmput(mm); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0e001dbad455..9d224075d895 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1255,6 +1255,18 @@ static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm) return bitmap_read(bitmap, 0, BITS_PER_LONG); } +/* + * Update the first system word of mm flags ONLY, applying the specified mask to + * it, then setting all flags specified by bits. + */ +static inline void __mm_flags_set_mask_bits_word(struct mm_struct *mm, + unsigned long mask, unsigned long bits) +{ + unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + + set_mask_bits(bitmap, mask, bits); +} + #define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \ MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 6eb65ceed213..b7fafe999073 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -8,6 +8,20 @@ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ +static inline unsigned long __mm_flags_get_dumpable(struct mm_struct *mm) +{ + /* + * By convention, dumpable bits are contained in first 32 bits of the + * bitmap, so we can simply access this first unsigned long directly. 
+ */ + return __mm_flags_get_word(mm); +} + +static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value) +{ + __mm_flags_set_mask_bits_word(mm, MMF_DUMPABLE_MASK, value); +} + extern void set_dumpable(struct mm_struct *mm, int value); /* * This returns the actual value of the suid_dumpable flag. For things @@ -22,7 +36,9 @@ static inline int __get_dumpable(unsigned long mm_flags) static inline int get_dumpable(struct mm_struct *mm) { - return __get_dumpable(mm->flags); + unsigned long flags = __mm_flags_get_dumpable(mm); + + return __get_dumpable(flags); } #endif /* _LINUX_SCHED_COREDUMP_H */ -- cgit v1.2.3 From 01f86753a05a3b971d69147d6d074c1a8d29b57d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:16 +0100 Subject: mm: correct sign-extension issue in MMF_* flag masks There is an issue with the mask declarations in linux/mm_types.h, which naively do (1 << bit) operations. Unfortunately this results in the 1 being defaulted as a signed (32-bit) integer. When the compiler expands the MMF_INIT_MASK bitmask it comes up with: (((1 << 2) - 1) | (((1 << 9) - 1) << 2) | (1 << 24) | (1 << 28) | (1 << 30) | (1 << 31)) Which overflows the signed integer to -788,527,105. Implicitly casting this to an unsigned integer results in sign-expansion, and thus this value becomes 0xffffffffd10007ff, rather than the intended 0xd10007ff. While we're limited to a maximum of 32 bits in mm->flags, this isn't an issue as the remaining bits being masked will always be zero. However, now we are moving towards having more bits in this flag, this becomes an issue. Simply resolve this by using the _BITUL() helper to cast the shifted value to an unsigned long. [lorenzo.stoakes@oracle.com: prefer BIT() to _BITUL()] Link: https://lkml.kernel.org/r/a0290c77-cd88-46d6-8d9a-073be7600d88@lucifer.local Link: https://lkml.kernel.org/r/f92194bee8c92a04fd4c9b2c14c7e65229639300.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9d224075d895..de09ae2a0de6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1767,7 +1767,7 @@ enum { * the modes are SUID_DUMP_* defined in linux/sched/coredump.h */ #define MMF_DUMPABLE_BITS 2 -#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) +#define MMF_DUMPABLE_MASK (BIT(MMF_DUMPABLE_BITS) - 1) /* coredump filter bits */ #define MMF_DUMP_ANON_PRIVATE 2 #define MMF_DUMP_ANON_SHARED 3 @@ -1782,13 +1782,13 @@ enum { #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS #define MMF_DUMP_FILTER_BITS 9 #define MMF_DUMP_FILTER_MASK \ - (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) + ((BIT(MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) #define MMF_DUMP_FILTER_DEFAULT \ - ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ - (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) + (BIT(MMF_DUMP_ANON_PRIVATE) | BIT(MMF_DUMP_ANON_SHARED) | \ + BIT(MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) #ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS -# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) +# define MMF_DUMP_MASK_DEFAULT_ELF BIT(MMF_DUMP_ELF_HEADERS) #else # define MMF_DUMP_MASK_DEFAULT_ELF 0 #endif @@ -1808,7 +1808,7 @@ enum { #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_FOLIO 23 /* mm has ever used the global huge zero folio */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) +#define MMF_DISABLE_THP_MASK BIT(MMF_DISABLE_THP) #define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ #define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* @@ -1821,16 +1821,15 @@ enum { #define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ #define MMF_HAS_MDWE 28 -#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) - +#define MMF_HAS_MDWE_MASK BIT(MMF_HAS_MDWE) #define MMF_HAS_MDWE_NO_INHERIT 29 #define MMF_VM_MERGE_ANY 30 -#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY) +#define MMF_VM_MERGE_ANY_MASK BIT(MMF_VM_MERGE_ANY) #define MMF_TOPDOWN 31 /* mm searches top down by default */ -#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN) +#define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ -- cgit v1.2.3 From 19148a19da86f1b7d1a1b067c9f656b0f3a60fb1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:17 +0100 Subject: mm: update fork mm->flags initialisation to use bitmap We now need to account for flag initialisation on fork. We retain the existing logic as much as we can, but dub the existing flag mask legacy. 
These flags are therefore required to fit in the first 32-bits of the flags field. However, further flag propagation upon fork can be implemented in mm_init() on a per-flag basis. We ensure we clear the entire bitmap prior to setting it, and use __mm_flags_get_word() and __mm_flags_set_word() to manipulate these legacy fields efficiently. Link: https://lkml.kernel.org/r/9fb8954a7a0f0184f012a8e66f8565bcbab014ba.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 13 ++++++++++--- kernel/fork.c | 7 +++++-- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index de09ae2a0de6..69ce407b4343 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1831,16 +1831,23 @@ enum { #define MMF_TOPDOWN 31 /* mm searches top down by default */ #define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN) -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ +#define MMF_INIT_LEGACY_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) -static inline unsigned long mmf_init_flags(unsigned long flags) +/* Legacy flags must fit within 32 bits. */ +static_assert((u64)MMF_INIT_LEGACY_MASK <= (u64)UINT_MAX); + +/* + * Initialise legacy flags according to masks, propagating selected flags on + * fork. Further flag manipulation can be performed by the caller. 
+ */ +static inline unsigned long mmf_init_legacy_flags(unsigned long flags) { if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) flags &= ~((1UL << MMF_HAS_MDWE) | (1UL << MMF_HAS_MDWE_NO_INHERIT)); - return flags & MMF_INIT_MASK; + return flags & MMF_INIT_LEGACY_MASK; } #endif /* _LINUX_MM_TYPES_H */ diff --git a/kernel/fork.c b/kernel/fork.c index af673856499d..b04ecba4a709 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1057,11 +1057,14 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_uprobes_state(mm); hugetlb_count_init(mm); + mm_flags_clear_all(mm); if (current->mm) { - mm->flags = mmf_init_flags(current->mm->flags); + unsigned long flags = __mm_flags_get_word(current->mm); + + __mm_flags_set_word(mm, mmf_init_legacy_flags(flags)); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { - mm->flags = default_dump_filter; + __mm_flags_set_word(mm, default_dump_filter); mm->def_flags = 0; } -- cgit v1.2.3 From 8166353fb8841390bf3ffe1b923a5ddeb348e4f7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:19 +0100 Subject: mm: replace mm->flags with bitmap entirely and set to 64 bits Now we have updated all users of mm->flags to use the bitmap accessors, we replace it with the bitmap version entirely. We are then able to move to having 64 bits of mm->flags on both 32-bit and 64-bit architectures. We also update the VMA userland tests to ensure that everything remains functional there. No functional changes intended, other than there now being 64 bits of available mm_struct flags. Link: https://lkml.kernel.org/r/e1f6654e016d36c43959764b01355736c5cbcdf8.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Petkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H.
Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ++++++------ include/linux/mm_types.h | 14 +++++--------- tools/testing/vma/vma_internal.h | 19 +++++++++++++++++-- 3 files changed, 28 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 34311ebe62cc..b61e2d4858cf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -724,32 +724,32 @@ static inline void assert_fault_locked(struct vm_fault *vmf) static inline bool mm_flags_test(int flag, const struct mm_struct *mm) { - return test_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm) { - return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm) { - return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline void mm_flags_set(int flag, struct mm_struct *mm) { - set_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline void mm_flags_clear(int flag, struct mm_struct *mm) { - clear_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline void mm_flags_clear_all(struct mm_struct *mm) { - bitmap_zero(ACCESS_PRIVATE(&mm->_flags, __mm_flags), NUM_MM_FLAG_BITS); + bitmap_zero(ACCESS_PRIVATE(&mm->flags, __mm_flags), NUM_MM_FLAG_BITS); } extern const struct vm_operations_struct vma_dummy_vm_ops; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 69ce407b4343..05475b5fd516 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -932,7 +932,7 @@ struct mm_cid { * Opaque type representing current mm_struct flag state. Must be accessed via * mm_flags_xxx() helper functions. */ -#define NUM_MM_FLAG_BITS BITS_PER_LONG +#define NUM_MM_FLAG_BITS (64) typedef struct { DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); } __private mm_flags_t; @@ -1119,11 +1119,7 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; - /* Temporary union while we convert users to mm_flags_t. */ - union { - unsigned long flags; /* Must use atomic bitops to access */ - mm_flags_t _flags; /* Must use mm_flags_* helpers to access */ - }; + mm_flags_t flags; /* Must use mm_flags_* hlpers to access */ #ifdef CONFIG_AIO spinlock_t ioctx_lock; @@ -1236,7 +1232,7 @@ struct mm_struct { /* Set the first system word of mm flags, non-atomically. 
*/ static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags); bitmap_copy(bitmap, &value, BITS_PER_LONG); } @@ -1244,7 +1240,7 @@ static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value /* Obtain a read-only view of the bitmap. */ static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm) { - return (const unsigned long *)ACCESS_PRIVATE(&mm->_flags, __mm_flags); + return (const unsigned long *)ACCESS_PRIVATE(&mm->flags, __mm_flags); } /* Read the first system word of mm flags, non-atomically. */ @@ -1262,7 +1258,7 @@ static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm) static inline void __mm_flags_set_mask_bits_word(struct mm_struct *mm, unsigned long mask, unsigned long bits) { - unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags); set_mask_bits(bitmap, mask, bits); } diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index cb1c2a8afe26..f13354bf0a1e 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -249,6 +249,14 @@ struct mutex {}; #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = {} +#define DECLARE_BITMAP(name, bits) \ + unsigned long name[BITS_TO_LONGS(bits)] + +#define NUM_MM_FLAG_BITS (64) +typedef struct { + __private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); +} mm_flags_t; + struct mm_struct { struct maple_tree mm_mt; int map_count; /* number of VMAs */ @@ -260,7 +268,7 @@ struct mm_struct { unsigned long def_flags; - unsigned long flags; /* Must use atomic bitops to access */ + mm_flags_t flags; /* Must use mm_flags_* helpers to access */ }; struct vm_area_struct; @@ -1333,6 +1341,13 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, { } +# define ACCESS_PRIVATE(p, member) ((p)->member) + +static inline bool mm_flags_test(int flag, const struct mm_struct *mm) +{ + return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); +} + /* * Denies creating a writable executable mapping or gaining executable permissions. * @@ -1363,7 +1378,7 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, static inline bool map_deny_write_exec(unsigned long old, unsigned long new) { /* If MDWE is disabled, we have nothing to deny. */ - if (!test_bit(MMF_HAS_MDWE, &current->mm->flags)) + if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) return false; /* If the new VMA is not executable, we have nothing to deny. */ -- cgit v1.2.3 From 0f9ab62a6e44ef51ea3e7f1c552b447cf4eb20ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Aug 2025 10:30:08 +0200 Subject: mempool: rename struct mempool_s to struct mempool Drop the pointless _s prefix and align to the usual struct naming to prepare for actually using the struct instead of the typedef so that random headers don't need to include mempool.h for just having a pointer to the mempool.
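To illustrate that motivation with a hypothetical header (not part of this patch; the device type and field names below are made up), a named struct lets any header hold a mempool pointer with only a forward declaration:

/* hypothetical_driver.h - illustration only */
struct mempool;                 /* forward declaration; no mempool.h include needed */

struct hypothetical_device {
	struct mempool *page_pool;  /* a pointer to an incomplete type is valid C */
};

Only the .c file that actually allocates from or frees to the pool has to include mempool.h for the full definition and API.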
Link: https://lkml.kernel.org/r/20250812083105.371295-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Harry Yoo Reviewed-by: Vlastimil Babka Cc: Christoph Lameter (Ampere) Cc: David Rientjes Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/blkdev.h | 2 +- include/linux/mempool.h | 2 +- include/linux/netfs.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fe1797bbec42..28ceaeffc0c9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -199,7 +199,7 @@ struct gendisk { unsigned int zone_wplugs_hash_bits; atomic_t nr_zone_wplugs; spinlock_t zone_wplugs_lock; - struct mempool_s *zone_wplugs_pool; + struct mempool *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 7b151441341b..34941a4b9026 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -15,7 +15,7 @@ struct kmem_cache; typedef void * (mempool_alloc_t)(gfp_t gfp_mask, void *pool_data); typedef void (mempool_free_t)(void *element, void *pool_data); -typedef struct mempool_s { +typedef struct mempool { spinlock_t lock; int min_nr; /* nr of elements at *elements */ int curr_nr; /* Current nr of elements at *elements */ diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 98c96d649bf9..72ee7d210a74 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -21,7 +21,7 @@ #include enum netfs_sreq_ref_trace; -typedef struct mempool_s mempool_t; +typedef struct mempool mempool_t; struct folio_queue; /** -- cgit v1.2.3 From 85b8cec15034e07500a6e5b8a5aea8185a3d775a Mon Sep 17 00:00:00 2001 From: Chris Li Date: Tue, 12 Aug 2025 00:10:59 -0700 Subject: mm: swap.h: Remove deleted field from comments The comment for struct swap_info_struct.lock incorrectly mentions fields that have already been deleted from the structure. Update the comments to accurately reflect the current struct swap_info_struct. There is no functional change. Link: https://lkml.kernel.org/r/20250812-swap-scan-list-v3-2-6d73504d267b@kernel.org Signed-off-by: Chris Li Reviewed-by: Kairui Song Acked-by: Nhat Pham Reviewed-by: Barry Song Cc: Baoquan He Cc: "Huang, Ying" Cc: Kemeng Shi Signed-off-by: Andrew Morton --- include/linux/swap.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index a060d102e0d1..c2da85cb7fe7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -320,11 +320,8 @@ struct swap_info_struct { struct completion comp; /* seldom referenced */ spinlock_t lock; /* * protect map scan related fields like - * swap_map, lowest_bit, highest_bit, - * inuse_pages, cluster_next, - * cluster_nr, lowest_alloc, - * highest_alloc, free/discard cluster - * list. other fields are only changed + * swap_map, inuse_pages and all cluster + * lists. other fields are only changed * at swapon/swapoff, so are protected * by swap_lock. changing flags need * hold this lock and swap_lock. If -- cgit v1.2.3 From ec45783fce52f358c9e8680d2837bc0d477f16ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Aug 2025 16:57:55 +0200 Subject: memcg: optimize exit to user space memcg uses TIF_NOTIFY_RESUME to handle reclaiming on exit to user space. TIF_NOTIFY_RESUME is a multiplexing TIF bit, which is utilized by other entities as well. 
This results in an unconditional mem_cgroup_handle_over_high() call for every invocation of resume_user_mode_work(), which is a pointless exercise as most of the time there is no reclaim work to do. Especially since RSEQ is used by glibc, TIF_NOTIFY_RESUME is raised quite frequently and the empty calls show up in exit path profiling. Optimize this by doing a quick check of the reclaim condition before invoking it. [akpm@linux-foundation.org: remove now-unneeded test of memcg_nr_pages_over_high==0, per Shakeel] Link: https://lkml.kernel.org/r/87tt2b6zgs.ffs@tglx Signed-off-by: Thomas Gleixner Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Michal Hocko Cc: Muchun Song Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 8 +++++++- mm/memcontrol.c | 7 ++----- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 785173aa0739..9fa3afc90dd5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -900,7 +900,13 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } -void mem_cgroup_handle_over_high(gfp_t gfp_mask); +void __mem_cgroup_handle_over_high(gfp_t gfp_mask); + +static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) +{ + if (unlikely(current->memcg_nr_pages_over_high)) + __mem_cgroup_handle_over_high(gfp_mask); +} unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8dd7fbed5a94..9712a751690f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2203,7 +2203,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, * try_charge() (context permitting), as well as from the userland * return path where reclaim is always able to block. */ -void mem_cgroup_handle_over_high(gfp_t gfp_mask) +void __mem_cgroup_handle_over_high(gfp_t gfp_mask) { unsigned long penalty_jiffies; unsigned long pflags; @@ -2213,9 +2213,6 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask) struct mem_cgroup *memcg; bool in_retry = false; - if (likely(!nr_pages)) - return; - memcg = get_mem_cgroup_from_mm(current->mm); current->memcg_nr_pages_over_high = 0; @@ -2486,7 +2483,7 @@ done_restock: if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && !(current->flags & PF_MEMALLOC) && gfpflags_allow_blocking(gfp_mask)) - mem_cgroup_handle_over_high(gfp_mask); + __mem_cgroup_handle_over_high(gfp_mask); return 0; } -- cgit v1.2.3 From 9dc21bbd62edeae6f63e6f25e1edb7167452457b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 Aug 2025 14:54:53 +0100 Subject: prctl: extend PR_SET_THP_DISABLE to optionally exclude VM_HUGEPAGE Patch series "prctl: extend PR_SET_THP_DISABLE to only provide THPs when advised", v5. This will allow individual processes to opt out of THP = "always" into THP = "madvise", without affecting other workloads on the system. This has been extensively discussed on the mailing list and has been summarized very well by David in the first patch, which also includes the links to alternatives; please refer to the first patch's commit message for the motivation for this series. Patch 1 adds the PR_THP_DISABLE_EXCEPT_ADVISED flag to implement this, along with the MMF changes. Patch 2 is a cleanup patch for tva_flags that will allow the forced collapse case to be transmitted to vma_thp_disabled (which is done in patch 3).
Patch 4 adds documentation for PR_SET_THP_DISABLE/PR_GET_THP_DISABLE. Patches 6-7 implement the selftests for PR_SET_THP_DISABLE for completely disabling THPs (old behaviour) and for only enabling them when advised (PR_THP_DISABLE_EXCEPT_ADVISED). This patch (of 7): People want to make use of more THPs, for example, moving from the "never" system policy to "madvise", or from "madvise" to "always". While this is great news for every THP desperately waiting to get allocated out there, apparently there are some workloads that require a bit of care during that transition: individual processes may need to opt out of this behavior for various reasons, and this should be permitted without needing to make all other workloads on the system similarly opt out. The following scenarios are imaginable: (1) Switch from "none" system policy to "madvise"/"always", but keep THPs disabled for selected workloads. (2) Stay at "none" system policy, but enable THPs for selected workloads, making only these workloads use the "madvise" or "always" policy. (3) Switch from "madvise" system policy to "always", but keep the "madvise" policy for selected workloads: allocate THPs only when advised. (4) Stay at "madvise" system policy, but enable THPs even when not advised for selected workloads -- "always" policy. One can emulate (2) through (1), by setting the system policy to "madvise"/"always" while disabling THPs for all processes that don't want THPs. It requires configuring all workloads, but that is a user-space problem to sort out. (4) can be emulated through (3) in a similar way. Back when (1) was relevant in the past, as people started enabling THPs, we added PR_SET_THP_DISABLE, so relevant workloads that were not ready yet (e.g., Redis) were able to just disable THPs completely. Redis still implements the option to use this interface to disable THPs completely. With PR_SET_THP_DISABLE, we added a way to force-disable THPs for a workload -- a process, including the fork+exec'ed process hierarchy. That essentially made us support (1): simply disable THPs for all workloads that are not ready for THPs yet, while still enabling THPs system-wide. The quest for handling (3) and (4) started, but current approaches (completely new prctl, options to set other policies per process, alternatives to prctl -- mctrl, cgroup handling) don't look particularly promising. Likely, the future will use bpf or something similar to implement better policies, in particular to also make better decisions about THP sizes to use, but this will certainly take a while as that work just started. Long story short: a simple enable/disable is not really suitable for the future, so we're not willing to add completely new toggles. While we could emulate (3)+(4) through (1)+(2) by simply disabling THPs completely for these processes, this is a step backwards, because these processes can no longer allocate THPs in regions where THPs were explicitly advised: regions flagged as VM_HUGEPAGE. Apparently, that poses a problem for relevant workloads, because "no THPs" is certainly worse than "THPs only when advised". Could we simply relax PR_SET_THP_DISABLE, to "disable THPs unless explicitly advised by the app through MADV_HUGEPAGE"? *maybe*, but this would change the documented semantics quite a bit, as well as the versatility of using it for debugging purposes, so I am not 100% sure that is what we want -- although it would certainly be much easier.
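For orientation before the itemized changes: from userspace, the extended interface this patch adds (spelled out in (A)-(E) below) can be exercised roughly like this -- a minimal sketch, assuming a kernel with this series applied:

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_THP_DISABLE_EXCEPT_ADVISED
	# define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1)	/* value introduced by this patch */
	#endif

	int main(void)
	{
		/* Disable THPs, except in regions the app advised (MADV_HUGEPAGE). */
		if (prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED, 0, 0))
			perror("PR_SET_THP_DISABLE");

		/* Reports 1 | PR_THP_DISABLE_EXCEPT_ADVISED, i.e. 3, in this state. */
		printf("THP disable state: %d\n",
		       prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0));
		return 0;
	}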
So instead, as an easy way forward for (3) and (4), add an option to make PR_SET_THP_DISABLE disable *less* THPs for a process. In essence, this patch: (A) Adds PR_THP_DISABLE_EXCEPT_ADVISED, to be used as a flag in arg3 of prctl(PR_SET_THP_DISABLE) when disabling THPs (arg2 != 0): prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED). (B) Makes prctl(PR_GET_THP_DISABLE) return 3 if PR_THP_DISABLE_EXCEPT_ADVISED was set while disabling. Previously, it would return 1 if THPs were disabled completely. Now it returns the set flags as well: 3 if PR_THP_DISABLE_EXCEPT_ADVISED was set. (C) Renames MMF_DISABLE_THP to MMF_DISABLE_THP_COMPLETELY, to express the semantics clearly. Fortunately, there are only two instances outside of prctl() code. (D) Adds MMF_DISABLE_THP_EXCEPT_ADVISED to express "no THP except for VMAs with VM_HUGEPAGE" -- essentially "thp=madvise" behavior. Fortunately, we only have to extend vma_thp_disabled(). (E) Indicates "THP_enabled: 0" in /proc/pid/status only if THPs are disabled completely. Only indicate that THPs are disabled when they are really disabled completely, not only partially. For now, we don't add another interface to obtain whether THPs are disabled partially (PR_THP_DISABLE_EXCEPT_ADVISED was set). If ever required, we could add a new entry. The documented semantics in the man page for PR_SET_THP_DISABLE ("is inherited by a child created via fork(2) and is preserved across execve(2)") are maintained. This behavior, for example, allows for disabling THPs for a workload through the launching process (e.g., systemd, where we fork() a helper process to then exec()). For now, MADV_COLLAPSE will *fail* in regions without VM_HUGEPAGE and VM_NOHUGEPAGE. As MADV_COLLAPSE is clear advice that user space thinks a THP is a good idea, we'll enable that separately next (requiring a bit of cleanup first). There is currently no way to prevent a process from issuing PR_SET_THP_DISABLE itself to re-enable THPs. There are not really any known users for re-enabling it, and it's against the purpose of the original interface. So if ever required, we could investigate just forbidding re-enabling it, or make this somehow configurable. Link: https://lkml.kernel.org/r/20250815135549.130506-1-usamaarif642@gmail.com Link: https://lkml.kernel.org/r/20250815135549.130506-2-usamaarif642@gmail.com Acked-by: Zi Yan Acked-by: Usama Arif Tested-by: Usama Arif Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Signed-off-by: Usama Arif Cc: Arnd Bergmann Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 5 ++-- fs/proc/array.c | 2 +- include/linux/huge_mm.h | 20 +++++++++---- include/linux/mm_types.h | 13 ++++----- include/uapi/linux/prctl.h | 10 +++++++ kernel/sys.c | 59 ++++++++++++++++++++++++++++++-------- mm/khugepaged.c | 2 +- 7 files changed, 82 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 2971551b7235..915a3e44bc12 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -291,8 +291,9 @@ It's slow but very precise.
HugetlbPages size of hugetlb memory portions CoreDumping process's memory is currently being dumped (killing the process may lead to a corrupted core) - THP_enabled process is allowed to use THP (returns 0 when - PR_SET_THP_DISABLE is set on the process + THP_enabled process is allowed to use THP (returns 0 when + PR_SET_THP_DISABLE is set on the process to disable + THP completely, not just partially) Threads number of threads SigQ number of signals queued/max. number for queue SigPnd bitmap of pending signals for the thread diff --git a/fs/proc/array.c b/fs/proc/array.c index c286dc12325e..d84b291dd1ed 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -422,7 +422,7 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm) bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE); if (thp_enabled) - thp_enabled = !mm_flags_test(MMF_DISABLE_THP, mm); + thp_enabled = !mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); seq_printf(m, "THP_enabled:\t%d\n", thp_enabled); } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 84b7eebe0d68..22b8b067b295 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -318,16 +318,26 @@ struct thpsize { (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) /* * Check whether THPs are explicitly disabled for this VMA, for example, * through madvise or prctl. */ static inline bool vma_thp_disabled(struct vm_area_struct *vma, vm_flags_t vm_flags) { + /* Are THPs disabled for this VMA? */ + if (vm_flags & VM_NOHUGEPAGE) + return true; + /* Are THPs disabled for all VMAs in the whole process? */ + if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, vma->vm_mm)) + return true; /* - * Explicitly disabled through madvise or prctl, or some - * architectures may disable THP for some mappings, for - * example, s390 kvm. + * Are THPs disabled only for VMAs where we didn't get an explicit + * advise to use them? */ - return (vm_flags & VM_NOHUGEPAGE) || - mm_flags_test(MMF_DISABLE_THP, vma->vm_mm); + if (vm_flags & VM_HUGEPAGE) + return false; + return mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, vma->vm_mm); } static inline bool thp_disabled_by_hw(void) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 05475b5fd516..d247da2fdb52 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1792,19 +1792,16 @@ enum { #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when mm is available for khugepaged */ -/* - * This one-shot flag is dropped due to necessity of changing exe once again - * on NFS restore - */ -//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ +#define MMF_HUGE_ZERO_FOLIO 18 /* mm has ever used the global huge zero folio */ #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ -#define MMF_HUGE_ZERO_FOLIO 23 /* mm has ever used the global huge zero folio */ -#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_DISABLE_THP_MASK BIT(MMF_DISABLE_THP) +#define MMF_DISABLE_THP_EXCEPT_ADVISED 23 /* no THP except when advised (e.g., VM_HUGEPAGE) */ +#define MMF_DISABLE_THP_COMPLETELY 24 /* no THP for all VMAs */ +#define MMF_DISABLE_THP_MASK (BIT(MMF_DISABLE_THP_COMPLETELY) | \ + BIT(MMF_DISABLE_THP_EXCEPT_ADVISED)) #define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ #define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index ed3aed264aeb..150b6deebfb1 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -177,7 +177,17 @@ struct prctl_mm_map { #define PR_GET_TID_ADDRESS 40 +/* + * Flags for PR_SET_THP_DISABLE are only applicable when disabling.
Bit 0 + * is reserved, so PR_GET_THP_DISABLE can return "1 | flags", to effectively + * return "1" when no flags were specified for PR_SET_THP_DISABLE. + */ #define PR_SET_THP_DISABLE 41 +/* + * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE / + * VM_HUGEPAGE). + */ +# define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1) #define PR_GET_THP_DISABLE 42 /* diff --git a/kernel/sys.c b/kernel/sys.c index 605f7fe9a143..a46d9b75880b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2452,6 +2452,51 @@ static int prctl_get_auxv(void __user *addr, unsigned long len) return sizeof(mm->saved_auxv); } +static int prctl_get_thp_disable(unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + struct mm_struct *mm = current->mm; + + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + /* If disabled, we return "1 | flags", otherwise 0. */ + if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm)) + return 1; + else if (mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, mm)) + return 1 | PR_THP_DISABLE_EXCEPT_ADVISED; + return 0; +} + +static int prctl_set_thp_disable(bool thp_disable, unsigned long flags, + unsigned long arg4, unsigned long arg5) +{ + struct mm_struct *mm = current->mm; + + if (arg4 || arg5) + return -EINVAL; + + /* Flags are only allowed when disabling. */ + if ((!thp_disable && flags) || (flags & ~PR_THP_DISABLE_EXCEPT_ADVISED)) + return -EINVAL; + if (mmap_write_lock_killable(current->mm)) + return -EINTR; + if (thp_disable) { + if (flags & PR_THP_DISABLE_EXCEPT_ADVISED) { + mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_set(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } else { + mm_flags_set(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } + } else { + mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } + mmap_write_unlock(current->mm); + return 0; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2625,20 +2670,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; return task_no_new_privs(current) ? 
1 : 0; case PR_GET_THP_DISABLE: - if (arg2 || arg3 || arg4 || arg5) - return -EINVAL; - error = !!mm_flags_test(MMF_DISABLE_THP, me->mm); + error = prctl_get_thp_disable(arg2, arg3, arg4, arg5); break; case PR_SET_THP_DISABLE: - if (arg3 || arg4 || arg5) - return -EINVAL; - if (mmap_write_lock_killable(me->mm)) - return -EINTR; - if (arg2) - mm_flags_set(MMF_DISABLE_THP, me->mm); - else - mm_flags_clear(MMF_DISABLE_THP, me->mm); - mmap_write_unlock(me->mm); + error = prctl_set_thp_disable(arg2, arg3, arg4, arg5); break; case PR_MPX_ENABLE_MANAGEMENT: case PR_MPX_DISABLE_MANAGEMENT: diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 550eb00116c5..1a416b865997 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -410,7 +410,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) { return hpage_collapse_test_exit(mm) || - mm_flags_test(MMF_DISABLE_THP, mm); + mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); } static bool hugepage_pmd_enabled(void) -- cgit v1.2.3 From 1f1c061089dcd274befa0c76fb9f6e253a8368c0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 Aug 2025 14:54:54 +0100 Subject: mm/huge_memory: convert "tva_flags" to "enum tva_type" When determining which THP orders are eligible for a VMA mapping, we have previously specified tva_flags; however, it turns out it is not really necessary to treat these as flags. Rather, we distinguish between distinct modes. The only case where we previously combined flags was with TVA_ENFORCE_SYSFS, but we can avoid this by observing that this is the default, except for MADV_COLLAPSE and edge cases in collapse_pte_mapped_thp() and hugepage_vma_revalidate(), and adding a mode specifically for this case - TVA_FORCED_COLLAPSE. We have: * smaps handling for showing "THPeligible" * Pagefault handling * khugepaged handling * Forced collapse handling: primarily MADV_COLLAPSE, but also for an edge case in collapse_pte_mapped_thp() Disregarding the edge cases, we want to ignore sysfs settings only when we are forcing a collapse through MADV_COLLAPSE; otherwise we want to enforce them. Hence this patch does the following flag-to-enum conversions: * TVA_SMAPS | TVA_ENFORCE_SYSFS -> TVA_SMAPS * TVA_IN_PF | TVA_ENFORCE_SYSFS -> TVA_PAGEFAULT * TVA_ENFORCE_SYSFS -> TVA_KHUGEPAGED * 0 -> TVA_FORCED_COLLAPSE With this change, we immediately know if we are in the forced collapse case, which will be valuable next.
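At a call site the conversion is mechanical; the page-fault path, taken from the mm/memory.c hunks below, is representative:

	/* Before: the mode was expressed as a flag combination. */
	orders = thp_vma_allowable_orders(vma, vma->vm_flags,
					  TVA_IN_PF | TVA_ENFORCE_SYSFS,
					  BIT(PMD_ORDER) - 1);

	/* After: a single mode; sysfs enforcement is implied by every
	 * mode except TVA_FORCED_COLLAPSE. */
	orders = thp_vma_allowable_orders(vma, vma->vm_flags,
					  TVA_PAGEFAULT,
					  BIT(PMD_ORDER) - 1);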
Link: https://lkml.kernel.org/r/20250815135549.130506-3-usamaarif642@gmail.com Signed-off-by: David Hildenbrand Signed-off-by: Usama Arif Acked-by: Usama Arif Reviewed-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Arnd Bergmann Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 4 ++-- include/linux/huge_mm.h | 30 ++++++++++++++++++------------ mm/huge_memory.c | 8 ++++---- mm/khugepaged.c | 17 ++++++++--------- mm/memory.c | 14 ++++++-------- 5 files changed, 38 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e8e7bef34531..ced01cf3c5ab 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1369,8 +1369,8 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %8u\n", - !!thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL)); + !!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS, + THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 22b8b067b295..92ea0b9771fa 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -94,12 +94,15 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; #define THP_ORDERS_ALL \ (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT) -#define TVA_SMAPS (1 << 0) /* Will be used for procfs */ -#define TVA_IN_PF (1 << 1) /* Page fault handler */ -#define TVA_ENFORCE_SYSFS (1 << 2) /* Obey sysfs configuration */ +enum tva_type { + TVA_SMAPS, /* Exposing "THPeligible:" in smaps. */ + TVA_PAGEFAULT, /* Serving a page fault. */ + TVA_KHUGEPAGED, /* Khugepaged collapse. */ + TVA_FORCED_COLLAPSE, /* Forced collapse (e.g. MADV_COLLAPSE). */ +}; -#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \ - (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order))) +#define thp_vma_allowable_order(vma, vm_flags, type, order) \ + (!!thp_vma_allowable_orders(vma, vm_flags, type, BIT(order))) #define split_folio(f) split_folio_to_list(f, NULL) @@ -264,14 +267,14 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders); /** * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma * @vma: the vm area to check * @vm_flags: use these vm_flags instead of vma->vm_flags - * @tva_flags: Which TVA flags to honour + * @type: TVA type * @orders: bitfield of all orders to consider * * Calculates the intersection of the requested hugepage orders and the allowed @@ -285,11 +288,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders) { - /* Optimization to check if required orders are enabled early. */ - if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) { + /* + * Optimization to check if required orders are enabled early. 
Only + * forced collapse ignores sysfs configs. + */ + if (type != TVA_FORCED_COLLAPSE && vma_is_anonymous(vma)) { unsigned long mask = READ_ONCE(huge_anon_orders_always); if (vm_flags & VM_HUGEPAGE) @@ -303,7 +309,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, return 0; } - return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); + return __thp_vma_allowable_orders(vma, vm_flags, type, orders); } struct thpsize { @@ -547,7 +553,7 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders) { return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a2f476e7419a..899d9ac86ecd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -99,12 +99,12 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders) { - bool smaps = tva_flags & TVA_SMAPS; - bool in_pf = tva_flags & TVA_IN_PF; - bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; + const bool smaps = type == TVA_SMAPS; + const bool in_pf = type == TVA_PAGEFAULT; + const bool enforce_sysfs = type != TVA_FORCED_COLLAPSE; unsigned long supported_orders; /* Check the intersection of requested and supported orders. */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1a416b865997..d3d4f116e14b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -474,8 +474,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_pmd_enabled()) { - if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, - PMD_ORDER)) + if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } } @@ -921,7 +920,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct collapse_control *cc) { struct vm_area_struct *vma; - unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0; + enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED : + TVA_FORCED_COLLAPSE; if (unlikely(hpage_collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; @@ -932,7 +932,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -1533,9 +1533,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * in the page cache with a single hugepage. If a mm were to fault-in * this memory (mapped by a suitably aligned VMA), we'd get the hugepage * and map it by a PMD, regardless of sysfs THP settings. As such, let's - * analogously elide sysfs THP settings here. + * analogously elide sysfs THP settings here and force collapse. 
*/ - if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2432,8 +2432,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!thp_vma_allowable_order(vma, vma->vm_flags, - TVA_ENFORCE_SYSFS, PMD_ORDER)) { + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { skip: progress++; continue; @@ -2767,7 +2766,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); - if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); diff --git a/mm/memory.c b/mm/memory.c index 002c28795d8b..7b1e8f137fa3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4515,8 +4515,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); orders = thp_swap_suitable_orders(swp_offset(entry), vmf->address, orders); @@ -5063,8 +5063,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) @@ -6254,8 +6254,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - thp_vma_allowable_order(vma, vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -6289,8 +6288,7 @@ retry_pud: goto retry_pud; if (pmd_none(*vmf.pmd) && - thp_vma_allowable_order(vma, vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; -- cgit v1.2.3 From 8cdc4d27019356b0304308eb799484c899b62a87 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 Aug 2025 14:54:55 +0100 Subject: mm/huge_memory: respect MADV_COLLAPSE with PR_THP_DISABLE_EXCEPT_ADVISED Let's allow MADV_COLLAPSE to succeed on areas that have neither VM_HUGEPAGE nor VM_NOHUGEPAGE when THPs are disabled unless explicitly advised (PR_THP_DISABLE_EXCEPT_ADVISED). MADV_COLLAPSE is clear advice that we want to collapse. Note that we still respect the VM_NOHUGEPAGE flag, just like MADV_COLLAPSE always does. Consequently, MADV_COLLAPSE is now refused only on VM_NOHUGEPAGE with PR_THP_DISABLE_EXCEPT_ADVISED, including for shmem.
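The resulting behaviour from userspace, as a hedged sketch (assumes an x86-64 kernel with this series applied; the 2 MiB size and the error handling are illustrative only):

	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <sys/prctl.h>

	#ifndef PR_THP_DISABLE_EXCEPT_ADVISED
	# define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1)
	#endif
	#ifndef MADV_COLLAPSE
	# define MADV_COLLAPSE 25
	#endif

	int main(void)
	{
		size_t len = 2UL << 20;		/* one PMD-sized region */
		char *buf;

		prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED, 0, 0);

		buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (buf == MAP_FAILED)
			return 1;
		memset(buf, 1, len);		/* populate with small pages */

		/* Neither VM_HUGEPAGE nor VM_NOHUGEPAGE is set here: refused
		 * before this patch, allowed to succeed with it. */
		if (madvise(buf, len, MADV_COLLAPSE))
			perror("MADV_COLLAPSE");

		/* VM_NOHUGEPAGE still wins, as it always did. */
		madvise(buf, len, MADV_NOHUGEPAGE);
		if (madvise(buf, len, MADV_COLLAPSE))
			perror("MADV_COLLAPSE after MADV_NOHUGEPAGE");	/* expected */
		return 0;
	}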
Link: https://lkml.kernel.org/r/20250815135549.130506-4-usamaarif642@gmail.com Co-developed-by: Usama Arif Signed-off-by: Usama Arif Signed-off-by: David Hildenbrand Reviewed-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Arnd Bergmann Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 8 +++++++- include/uapi/linux/prctl.h | 2 +- mm/huge_memory.c | 5 +++-- mm/memory.c | 6 ++++-- mm/shmem.c | 2 +- 5 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 92ea0b9771fa..1ac0d06fb3c1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -329,7 +329,7 @@ struct thpsize { * through madvise or prctl. */ static inline bool vma_thp_disabled(struct vm_area_struct *vma, - vm_flags_t vm_flags) + vm_flags_t vm_flags, bool forced_collapse) { /* Are THPs disabled for this VMA? */ if (vm_flags & VM_NOHUGEPAGE) @@ -343,6 +343,12 @@ static inline bool vma_thp_disabled(struct vm_area_struct *vma, */ if (vm_flags & VM_HUGEPAGE) return false; + /* + * Forcing a collapse (e.g., madv_collapse), is a clear advice to + * use THPs. + */ + if (forced_collapse) + return false; return mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, vma->vm_mm); } diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 150b6deebfb1..51c4e8c82b1e 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -185,7 +185,7 @@ struct prctl_mm_map { #define PR_SET_THP_DISABLE 41 /* * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE / - * VM_HUGEPAGE). + * VM_HUGEPAGE, MADV_COLLAPSE). */ # define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1) #define PR_GET_THP_DISABLE 42 diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 899d9ac86ecd..d89992b65acc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -104,7 +104,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, { const bool smaps = type == TVA_SMAPS; const bool in_pf = type == TVA_PAGEFAULT; - const bool enforce_sysfs = type != TVA_FORCED_COLLAPSE; + const bool forced_collapse = type == TVA_FORCED_COLLAPSE; + const bool enforce_sysfs = !forced_collapse; unsigned long supported_orders; /* Check the intersection of requested and supported orders. */ @@ -122,7 +123,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, if (!vma->vm_mm) /* vdso */ return 0; - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse)) return 0; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ diff --git a/mm/memory.c b/mm/memory.c index 7b1e8f137fa3..d9de6c056179 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5332,9 +5332,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa * It is too late to allocate a small folio, we already have a large * folio in the pagecache: especially s390 KVM cannot tolerate any * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any - * PMD mappings if THPs are disabled. + * PMD mappings if THPs are disabled. As we already have a THP, + * behave as if we are forcing a collapse. 
*/ - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags, + /* forced_collapse=*/ true)) return ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) diff --git a/mm/shmem.c b/mm/shmem.c index e2c76a30802b..d945de3a7f0e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1817,7 +1817,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; - if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) + if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force))) return 0; global_orders = shmem_huge_global_enabled(inode, index, write_end, -- cgit v1.2.3 From 53fbef56e07df822ea3029109ffca25328c2e5ac Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:51 +0100 Subject: mm: introduce memdesc_flags_t Patch series "Add and use memdesc_flags_t". At some point struct page will be separated from struct slab and struct folio. This is a step towards that by introducing a type for the 'flags' word of all three structures. This gives us a certain amount of type safety by establishing that some of these unsigned longs are different from other unsigned longs in that they contain things like node ID, section number and zone number in the upper bits. That lets us have functions that can be easily called by anyone who has a slab, folio or page (but not easily by anyone else) to get the node or zone. There are going to be some unusual merge problems with this as some odd bits of the kernel decide they want to print out the flags value or something similar by writing page->flags and now they'll need to write page->flags.f instead. That's most of the churn here. Maybe we should be removing these things from the debug output? This patch (of 11): Wrap the unsigned long flags in a typedef. In upcoming patches, this will provide a strong hint that you can't just pass a random unsigned long to functions which take this as an argument.
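Judging from the accessors in the diff (every former page->flags read becomes page->flags.f), the wrapper is a single-member struct. A standalone sketch of the type-safety argument; memdesc_test_bit() is a hypothetical helper, not part of the patch:

	typedef struct {
		unsigned long f;
	} memdesc_flags_t;

	/* Accepts only the wrapped type, not a bare unsigned long. */
	static inline int memdesc_test_bit(memdesc_flags_t flags, int bit)
	{
		return (flags.f >> bit) & 1UL;
	}

	/* memdesc_test_bit(folio->flags, 3) compiles;
	 * memdesc_test_bit(0x8UL, 3) is now a compile error instead of a
	 * silently accepted random unsigned long. */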
[willy@infradead.org: s/flags/flags.f/ in several architectures] Link: https://lkml.kernel.org/r/aKMgPRLD-WnkPxYm@casper.infradead.org [nicola.vetrini@gmail.com: mips: fix compilation error] Link: https://lore.kernel.org/lkml/CA+G9fYvkpmqGr6wjBNHY=dRp71PLCoi2341JxOudi60yqaeUdg@mail.gmail.com/ Link: https://lkml.kernel.org/r/20250825214245.1838158-1-nicola.vetrini@gmail.com Link: https://lkml.kernel.org/r/20250805172307.1302730-1-willy@infradead.org Link: https://lkml.kernel.org/r/20250805172307.1302730-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- arch/arc/mm/cache.c | 8 +++---- arch/arc/mm/tlb.c | 2 +- arch/arm/include/asm/hugetlb.h | 2 +- arch/arm/mm/copypage-v4mc.c | 2 +- arch/arm/mm/copypage-v6.c | 2 +- arch/arm/mm/copypage-xscale.c | 2 +- arch/arm/mm/dma-mapping.c | 2 +- arch/arm/mm/fault-armv.c | 2 +- arch/arm/mm/flush.c | 10 ++++----- arch/arm64/include/asm/hugetlb.h | 6 ++--- arch/arm64/include/asm/mte.h | 16 +++++++------- arch/arm64/mm/flush.c | 8 +++---- arch/csky/abiv1/cacheflush.c | 6 ++--- arch/mips/include/asm/cacheflush.h | 6 ++--- arch/nios2/mm/cacheflush.c | 6 ++--- arch/openrisc/include/asm/cacheflush.h | 2 +- arch/openrisc/mm/cache.c | 2 +- arch/parisc/kernel/cache.c | 6 ++--- arch/powerpc/include/asm/cacheflush.h | 4 ++-- arch/powerpc/include/asm/kvm_ppc.h | 4 ++-- arch/powerpc/mm/book3s64/hash_utils.c | 4 ++-- arch/powerpc/mm/pgtable.c | 12 +++++----- arch/riscv/include/asm/cacheflush.h | 4 ++-- arch/riscv/include/asm/hugetlb.h | 2 +- arch/riscv/mm/cacheflush.c | 4 ++-- arch/s390/include/asm/hugetlb.h | 2 +- arch/s390/kernel/uv.c | 12 +++++----- arch/s390/mm/gmap.c | 2 +- arch/s390/mm/hugetlbpage.c | 2 +- arch/sh/include/asm/hugetlb.h | 2 +- arch/sh/mm/cache-sh4.c | 2 +- arch/sh/mm/cache-sh7705.c | 2 +- arch/sh/mm/cache.c | 14 ++++++------ arch/sh/mm/kmap.c | 2 +- arch/sparc/mm/init_64.c | 10 ++++----- arch/x86/mm/pat/memtype.c | 6 ++--- arch/xtensa/mm/cache.c | 12 +++++----- fs/fuse/dev.c | 2 +- fs/gfs2/glops.c | 2 +- fs/jffs2/file.c | 4 ++-- fs/nilfs2/page.c | 2 +- fs/proc/page.c | 4 ++-- fs/ubifs/file.c | 6 ++--- include/linux/mm.h | 32 +++++++++++++-------------- include/linux/mm_inline.h | 12 +++++----- include/linux/mm_types.h | 8 +++++-- include/linux/mmzone.h | 2 +- include/linux/page-flags.h | 40 +++++++++++++++++----------------- include/linux/pgalloc_tag.h | 7 +++--- include/trace/events/page_ref.h | 4 ++-- mm/filemap.c | 8 +++---- mm/huge_memory.c | 4 ++-- mm/memory-failure.c | 12 +++++----- mm/mmzone.c | 4 ++-- mm/page_alloc.c | 12 +++++----- mm/swap.c | 8 +++---- mm/vmscan.c | 18 +++++++-------- mm/workingset.c | 2 +- 58 files changed, 195 insertions(+), 190 deletions(-) (limited to 'include/linux') diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 9106ceac323c..7d2f93dc1e91 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -704,7 +704,7 @@ static inline void arc_slc_enable(void) void flush_dcache_folio(struct folio *folio) { - clear_bit(PG_dc_clean, &folio->flags); + clear_bit(PG_dc_clean, &folio->flags.f); return; } EXPORT_SYMBOL(flush_dcache_folio); @@ -889,8 +889,8 @@ void copy_user_highpage(struct page *to, struct page *from, copy_page(kto, kfrom); - clear_bit(PG_dc_clean, &dst->flags); - clear_bit(PG_dc_clean, &src->flags); + clear_bit(PG_dc_clean, &dst->flags.f); + clear_bit(PG_dc_clean, &src->flags.f); kunmap_atomic(kto); kunmap_atomic(kfrom); @@ -900,7 +900,7 @@ void clear_user_page(void *to, unsigned 
long u_vaddr, struct page *page) { struct folio *folio = page_folio(page); clear_page(to); - clear_bit(PG_dc_clean, &folio->flags); + clear_bit(PG_dc_clean, &folio->flags.f); } EXPORT_SYMBOL(clear_user_page); diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c index cae4a7aae0ed..ed6915ba76ec 100644 --- a/arch/arc/mm/tlb.c +++ b/arch/arc/mm/tlb.c @@ -488,7 +488,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, */ if (vma->vm_flags & VM_EXEC) { struct folio *folio = page_folio(page); - int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags); + int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags.f); if (dirty) { unsigned long offset = offset_in_folio(folio, paddr); nr = folio_nr_pages(folio); diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index b766c4b373f6..700055b1ccb3 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -17,7 +17,7 @@ static inline void arch_clear_hugetlb_flags(struct folio *folio) { - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c index 7ddd82b9fe8b..ed843bb22020 100644 --- a/arch/arm/mm/copypage-v4mc.c +++ b/arch/arm/mm/copypage-v4mc.c @@ -67,7 +67,7 @@ void v4_mc_copy_user_highpage(struct page *to, struct page *from, struct folio *src = page_folio(from); void *kto = kmap_atomic(to); - if (!test_and_set_bit(PG_dcache_clean, &src->flags)) + if (!test_and_set_bit(PG_dcache_clean, &src->flags.f)) __flush_dcache_folio(folio_flush_mapping(src), src); raw_spin_lock(&minicache_lock); diff --git a/arch/arm/mm/copypage-v6.c b/arch/arm/mm/copypage-v6.c index a1a71f36d850..0710dba5c0bf 100644 --- a/arch/arm/mm/copypage-v6.c +++ b/arch/arm/mm/copypage-v6.c @@ -73,7 +73,7 @@ static void v6_copy_user_highpage_aliasing(struct page *to, unsigned int offset = CACHE_COLOUR(vaddr); unsigned long kfrom, kto; - if (!test_and_set_bit(PG_dcache_clean, &src->flags)) + if (!test_and_set_bit(PG_dcache_clean, &src->flags.f)) __flush_dcache_folio(folio_flush_mapping(src), src); /* FIXME: not highmem safe */ diff --git a/arch/arm/mm/copypage-xscale.c b/arch/arm/mm/copypage-xscale.c index f1e29d3e8193..e16af68d709f 100644 --- a/arch/arm/mm/copypage-xscale.c +++ b/arch/arm/mm/copypage-xscale.c @@ -87,7 +87,7 @@ void xscale_mc_copy_user_highpage(struct page *to, struct page *from, struct folio *src = page_folio(from); void *kto = kmap_atomic(to); - if (!test_and_set_bit(PG_dcache_clean, &src->flags)) + if (!test_and_set_bit(PG_dcache_clean, &src->flags.f)) __flush_dcache_folio(folio_flush_mapping(src), src); raw_spin_lock(&minicache_lock); diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 88c2d68a69c9..08641a936394 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -718,7 +718,7 @@ static void __dma_page_dev_to_cpu(struct page *page, unsigned long off, if (size < sz) break; if (!offset) - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); offset = 0; size -= sz; if (!size) diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 39fd5df73317..91e488767783 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -203,7 +203,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, folio = page_folio(pfn_to_page(pfn)); mapping = folio_flush_mapping(folio); - if (!test_and_set_bit(PG_dcache_clean, 
&folio->flags)) + if (!test_and_set_bit(PG_dcache_clean, &folio->flags.f)) __flush_dcache_folio(mapping, folio); if (mapping) { if (cache_is_vivt()) diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 5219158d54cf..19470d938b23 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -304,7 +304,7 @@ void __sync_icache_dcache(pte_t pteval) else mapping = NULL; - if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) + if (!test_and_set_bit(PG_dcache_clean, &folio->flags.f)) __flush_dcache_folio(mapping, folio); if (pte_exec(pteval)) @@ -343,8 +343,8 @@ void flush_dcache_folio(struct folio *folio) return; if (!cache_ops_need_broadcast() && cache_is_vipt_nonaliasing()) { - if (test_bit(PG_dcache_clean, &folio->flags)) - clear_bit(PG_dcache_clean, &folio->flags); + if (test_bit(PG_dcache_clean, &folio->flags.f)) + clear_bit(PG_dcache_clean, &folio->flags.f); return; } @@ -352,14 +352,14 @@ void flush_dcache_folio(struct folio *folio) if (!cache_ops_need_broadcast() && mapping && !folio_mapped(folio)) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); else { __flush_dcache_folio(mapping, folio); if (mapping && cache_is_vivt()) __flush_dcache_aliases(mapping, folio); else if (mapping) __flush_icache_all(); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } EXPORT_SYMBOL(flush_dcache_folio); diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 2a8155c4a882..44c1f757bfcf 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -21,12 +21,12 @@ extern bool arch_hugetlb_migration_supported(struct hstate *h); static inline void arch_clear_hugetlb_flags(struct folio *folio) { - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); #ifdef CONFIG_ARM64_MTE if (system_supports_mte()) { - clear_bit(PG_mte_tagged, &folio->flags); - clear_bit(PG_mte_lock, &folio->flags); + clear_bit(PG_mte_tagged, &folio->flags.f); + clear_bit(PG_mte_lock, &folio->flags.f); } #endif } diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 6567df8ec8ca..3b5069f4683d 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -48,12 +48,12 @@ static inline void set_page_mte_tagged(struct page *page) * before the page flags update. */ smp_wmb(); - set_bit(PG_mte_tagged, &page->flags); + set_bit(PG_mte_tagged, &page->flags.f); } static inline bool page_mte_tagged(struct page *page) { - bool ret = test_bit(PG_mte_tagged, &page->flags); + bool ret = test_bit(PG_mte_tagged, &page->flags.f); VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); @@ -82,7 +82,7 @@ static inline bool try_page_mte_tagging(struct page *page) { VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); - if (!test_and_set_bit(PG_mte_lock, &page->flags)) + if (!test_and_set_bit(PG_mte_lock, &page->flags.f)) return true; /* @@ -90,7 +90,7 @@ static inline bool try_page_mte_tagging(struct page *page) * already. Check if the PG_mte_tagged flag has been set or wait * otherwise. */ - smp_cond_load_acquire(&page->flags, VAL & (1UL << PG_mte_tagged)); + smp_cond_load_acquire(&page->flags.f, VAL & (1UL << PG_mte_tagged)); return false; } @@ -173,13 +173,13 @@ static inline void folio_set_hugetlb_mte_tagged(struct folio *folio) * before the folio flags update. 
*/ smp_wmb(); - set_bit(PG_mte_tagged, &folio->flags); + set_bit(PG_mte_tagged, &folio->flags.f); } static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio) { - bool ret = test_bit(PG_mte_tagged, &folio->flags); + bool ret = test_bit(PG_mte_tagged, &folio->flags.f); VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); @@ -196,7 +196,7 @@ static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) { VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); - if (!test_and_set_bit(PG_mte_lock, &folio->flags)) + if (!test_and_set_bit(PG_mte_lock, &folio->flags.f)) return true; /* @@ -204,7 +204,7 @@ static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) * already. Check if the PG_mte_tagged flag has been set or wait * otherwise. */ - smp_cond_load_acquire(&folio->flags, VAL & (1UL << PG_mte_tagged)); + smp_cond_load_acquire(&folio->flags.f, VAL & (1UL << PG_mte_tagged)); return false; } diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 013eead9b695..fbf08b543c3f 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -53,11 +53,11 @@ void __sync_icache_dcache(pte_t pte) { struct folio *folio = page_folio(pte_page(pte)); - if (!test_bit(PG_dcache_clean, &folio->flags)) { + if (!test_bit(PG_dcache_clean, &folio->flags.f)) { sync_icache_aliases((unsigned long)folio_address(folio), (unsigned long)folio_address(folio) + folio_size(folio)); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } EXPORT_SYMBOL_GPL(__sync_icache_dcache); @@ -69,8 +69,8 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache); */ void flush_dcache_folio(struct folio *folio) { - if (test_bit(PG_dcache_clean, &folio->flags)) - clear_bit(PG_dcache_clean, &folio->flags); + if (test_bit(PG_dcache_clean, &folio->flags.f)) + clear_bit(PG_dcache_clean, &folio->flags.f); } EXPORT_SYMBOL(flush_dcache_folio); diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c index 171e8fb32285..4bc0aad3cf8a 100644 --- a/arch/csky/abiv1/cacheflush.c +++ b/arch/csky/abiv1/cacheflush.c @@ -25,12 +25,12 @@ void flush_dcache_folio(struct folio *folio) mapping = folio_flush_mapping(folio); if (mapping && !folio_mapped(folio)) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); else { dcache_wbinv_all(); if (mapping) icache_inv_all(); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } EXPORT_SYMBOL(flush_dcache_folio); @@ -56,7 +56,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, return; folio = page_folio(pfn_to_page(pfn)); - if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) + if (!test_and_set_bit(PG_dcache_clean, &folio->flags.f)) dcache_wbinv_all(); if (folio_flush_mapping(folio)) { diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h index 1f14132b3fc9..5d283ef89d90 100644 --- a/arch/mips/include/asm/cacheflush.h +++ b/arch/mips/include/asm/cacheflush.h @@ -37,11 +37,11 @@ #define PG_dcache_dirty PG_arch_1 #define folio_test_dcache_dirty(folio) \ - test_bit(PG_dcache_dirty, &(folio)->flags) + test_bit(PG_dcache_dirty, &(folio)->flags.f) #define folio_set_dcache_dirty(folio) \ - set_bit(PG_dcache_dirty, &(folio)->flags) + set_bit(PG_dcache_dirty, &(folio)->flags.f) #define folio_clear_dcache_dirty(folio) \ - clear_bit(PG_dcache_dirty, &(folio)->flags) + clear_bit(PG_dcache_dirty, &(folio)->flags.f) extern void (*flush_cache_all)(void); extern void (*__flush_cache_all)(void); diff --git a/arch/nios2/mm/cacheflush.c 
b/arch/nios2/mm/cacheflush.c index 0ee9c5f02e08..8321182eb927 100644 --- a/arch/nios2/mm/cacheflush.c +++ b/arch/nios2/mm/cacheflush.c @@ -187,7 +187,7 @@ void flush_dcache_folio(struct folio *folio) /* Flush this page if there are aliases. */ if (mapping && !mapping_mapped(mapping)) { - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } else { __flush_dcache_folio(folio); if (mapping) { @@ -195,7 +195,7 @@ void flush_dcache_folio(struct folio *folio) flush_aliases(mapping, folio); flush_icache_range(start, start + folio_size(folio)); } - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } EXPORT_SYMBOL(flush_dcache_folio); @@ -227,7 +227,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, return; folio = page_folio(pfn_to_page(pfn)); - if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) + if (!test_and_set_bit(PG_dcache_clean, &folio->flags.f)) __flush_dcache_folio(folio); mapping = folio_flush_mapping(folio); diff --git a/arch/openrisc/include/asm/cacheflush.h b/arch/openrisc/include/asm/cacheflush.h index 0e60af486ec1..cd8f971c0fec 100644 --- a/arch/openrisc/include/asm/cacheflush.h +++ b/arch/openrisc/include/asm/cacheflush.h @@ -75,7 +75,7 @@ static inline void sync_icache_dcache(struct page *page) static inline void flush_dcache_folio(struct folio *folio) { - clear_bit(PG_dc_clean, &folio->flags); + clear_bit(PG_dc_clean, &folio->flags.f); } #define flush_dcache_folio flush_dcache_folio diff --git a/arch/openrisc/mm/cache.c b/arch/openrisc/mm/cache.c index 0f265b8e73ec..f33df46dae4e 100644 --- a/arch/openrisc/mm/cache.c +++ b/arch/openrisc/mm/cache.c @@ -83,7 +83,7 @@ void update_cache(struct vm_area_struct *vma, unsigned long address, { unsigned long pfn = pte_val(*pte) >> PAGE_SHIFT; struct folio *folio = page_folio(pfn_to_page(pfn)); - int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags); + int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags.f); /* * Since icaches do not snoop for updated data on OpenRISC, we diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 37ca484cc495..4c5240d3a3c7 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -122,10 +122,10 @@ void __update_cache(pte_t pte) pfn = folio_pfn(folio); nr = folio_nr_pages(folio); if (folio_flush_mapping(folio) && - test_bit(PG_dcache_dirty, &folio->flags)) { + test_bit(PG_dcache_dirty, &folio->flags.f)) { while (nr--) flush_kernel_dcache_page_addr(pfn_va(pfn + nr)); - clear_bit(PG_dcache_dirty, &folio->flags); + clear_bit(PG_dcache_dirty, &folio->flags.f); } else if (parisc_requires_coherency()) while (nr--) flush_kernel_dcache_page_addr(pfn_va(pfn + nr)); @@ -481,7 +481,7 @@ void flush_dcache_folio(struct folio *folio) pgoff_t pgoff; if (mapping && !mapping_mapped(mapping)) { - set_bit(PG_dcache_dirty, &folio->flags); + set_bit(PG_dcache_dirty, &folio->flags.f); return; } diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index f2656774aaa9..1fea42928f64 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -40,8 +40,8 @@ static inline void flush_dcache_folio(struct folio *folio) if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) return; /* avoid an atomic op if possible */ - if (test_bit(PG_dcache_clean, &folio->flags)) - clear_bit(PG_dcache_clean, &folio->flags); + if (test_bit(PG_dcache_clean, &folio->flags.f)) + clear_bit(PG_dcache_clean, &folio->flags.f); } #define 
flush_dcache_folio flush_dcache_folio diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index ca3829d47ab7..0953f2daa466 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -939,9 +939,9 @@ static inline void kvmppc_mmu_flush_icache(kvm_pfn_t pfn) /* Clear i-cache for new pages */ folio = page_folio(pfn_to_page(pfn)); - if (!test_bit(PG_dcache_clean, &folio->flags)) { + if (!test_bit(PG_dcache_clean, &folio->flags.f)) { flush_dcache_icache_folio(folio); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 4693c464fc5a..3aee3af614af 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1562,11 +1562,11 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) folio = page_folio(pte_page(pte)); /* page is dirty */ - if (!test_bit(PG_dcache_clean, &folio->flags) && + if (!test_bit(PG_dcache_clean, &folio->flags.f) && !folio_test_reserved(folio)) { if (trap == INTERRUPT_INST_STORAGE) { flush_dcache_icache_folio(folio); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } else pp |= HPTE_R_N; } diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index dfaa9fd86f7e..56d7e8960e77 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -87,9 +87,9 @@ static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr) struct folio *folio = maybe_pte_to_folio(pte); if (!folio) return pte; - if (!test_bit(PG_dcache_clean, &folio->flags)) { + if (!test_bit(PG_dcache_clean, &folio->flags.f)) { flush_dcache_icache_folio(folio); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } return pte; @@ -127,13 +127,13 @@ static inline pte_t set_pte_filter(pte_t pte, unsigned long addr) return pte; /* If the page clean, we move on */ - if (test_bit(PG_dcache_clean, &folio->flags)) + if (test_bit(PG_dcache_clean, &folio->flags.f)) return pte; /* If it's an exec fault, we flush the cache and make it clean */ if (is_exec_fault()) { flush_dcache_icache_folio(folio); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); return pte; } @@ -175,12 +175,12 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, goto bail; /* If the page is already clean, we move on */ - if (test_bit(PG_dcache_clean, &folio->flags)) + if (test_bit(PG_dcache_clean, &folio->flags.f)) goto bail; /* Clean the page and set PG_dcache_clean */ flush_dcache_icache_folio(folio); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); bail: return pte_mkexec(pte); diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index 6086b38d5427..0092513c3376 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -23,8 +23,8 @@ static inline void local_flush_icache_range(unsigned long start, static inline void flush_dcache_folio(struct folio *folio) { - if (test_bit(PG_dcache_clean, &folio->flags)) - clear_bit(PG_dcache_clean, &folio->flags); + if (test_bit(PG_dcache_clean, &folio->flags.f)) + clear_bit(PG_dcache_clean, &folio->flags.f); } #define flush_dcache_folio flush_dcache_folio #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index 
446126497768..0872d43fc0c0 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -7,7 +7,7 @@ static inline void arch_clear_hugetlb_flags(struct folio *folio) { - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 4ca5aafce22e..d83a612464f6 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -101,9 +101,9 @@ void flush_icache_pte(struct mm_struct *mm, pte_t pte) { struct folio *folio = page_folio(pte_page(pte)); - if (!test_bit(PG_dcache_clean, &folio->flags)) { + if (!test_bit(PG_dcache_clean, &folio->flags.f)) { flush_icache_mm(mm, false); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } #endif /* CONFIG_MMU */ diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 931fcc413598..69131736daaa 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -39,7 +39,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, static inline void arch_clear_hugetlb_flags(struct folio *folio) { - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 47f574cd1728..93b2a01bae40 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -144,7 +144,7 @@ int uv_destroy_folio(struct folio *folio) folio_get(folio); rc = uv_destroy(folio_to_phys(folio)); if (!rc) - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); folio_put(folio); return rc; } @@ -193,7 +193,7 @@ int uv_convert_from_secure_folio(struct folio *folio) folio_get(folio); rc = uv_convert_from_secure(folio_to_phys(folio)); if (!rc) - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); folio_put(folio); return rc; } @@ -289,7 +289,7 @@ static int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) expected = expected_folio_refs(folio) + 1; if (!folio_ref_freeze(folio, expected)) return -EBUSY; - set_bit(PG_arch_1, &folio->flags); + set_bit(PG_arch_1, &folio->flags.f); /* * If the UVC does not succeed or fail immediately, we don't want to * loop for long, or we might get stall notifications. @@ -483,18 +483,18 @@ int arch_make_folio_accessible(struct folio *folio) * convert_to_secure. * As secure pages are never large folios, both variants can co-exists. 
*/ - if (!test_bit(PG_arch_1, &folio->flags)) + if (!test_bit(PG_arch_1, &folio->flags.f)) return 0; rc = uv_pin_shared(folio_to_phys(folio)); if (!rc) { - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); return 0; } rc = uv_convert_from_secure(folio_to_phys(folio)); if (!rc) { - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); return 0; } diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index c7defe4ed1f6..8ff6bba107e8 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2272,7 +2272,7 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, start = pmd_val(*pmd) & HPAGE_MASK; end = start + HPAGE_SIZE; __storage_key_init_range(start, end); - set_bit(PG_arch_1, &folio->flags); + set_bit(PG_arch_1, &folio->flags.f); cond_resched(); return 0; } diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index e88c02c9e642..72e8fa136af5 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -155,7 +155,7 @@ static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) paddr = rste & PMD_MASK; } - if (!test_and_set_bit(PG_arch_1, &folio->flags)) + if (!test_and_set_bit(PG_arch_1, &folio->flags.f)) __storage_key_init_range(paddr, paddr + size); } diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 4a92e6e4d627..974512f359f0 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -14,7 +14,7 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, static inline void arch_clear_hugetlb_flags(struct folio *folio) { - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index 46393b00137e..83fb34b39ca7 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c @@ -114,7 +114,7 @@ static void sh4_flush_dcache_folio(void *arg) struct address_space *mapping = folio_flush_mapping(folio); if (mapping && !mapping_mapped(mapping)) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); else #endif { diff --git a/arch/sh/mm/cache-sh7705.c b/arch/sh/mm/cache-sh7705.c index b509a407588f..71f8be9fc8e0 100644 --- a/arch/sh/mm/cache-sh7705.c +++ b/arch/sh/mm/cache-sh7705.c @@ -138,7 +138,7 @@ static void sh7705_flush_dcache_folio(void *arg) struct address_space *mapping = folio_flush_mapping(folio); if (mapping && !mapping_mapped(mapping)) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); else { unsigned long pfn = folio_pfn(folio); unsigned int i, nr = folio_nr_pages(folio); diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c index 6ebdeaff3021..c3f028bed049 100644 --- a/arch/sh/mm/cache.c +++ b/arch/sh/mm/cache.c @@ -64,14 +64,14 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, struct folio *folio = page_folio(page); if (boot_cpu_data.dcache.n_aliases && folio_mapped(folio) && - test_bit(PG_dcache_clean, &folio->flags)) { + test_bit(PG_dcache_clean, &folio->flags.f)) { void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); memcpy(vto, src, len); kunmap_coherent(vto); } else { memcpy(dst, src, len); if (boot_cpu_data.dcache.n_aliases) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } if (vma->vm_flags & VM_EXEC) @@ -85,14 +85,14 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page 
*page, struct folio *folio = page_folio(page); if (boot_cpu_data.dcache.n_aliases && folio_mapped(folio) && - test_bit(PG_dcache_clean, &folio->flags)) { + test_bit(PG_dcache_clean, &folio->flags.f)) { void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); memcpy(dst, vfrom, len); kunmap_coherent(vfrom); } else { memcpy(dst, src, len); if (boot_cpu_data.dcache.n_aliases) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } } @@ -105,7 +105,7 @@ void copy_user_highpage(struct page *to, struct page *from, vto = kmap_atomic(to); if (boot_cpu_data.dcache.n_aliases && folio_mapped(src) && - test_bit(PG_dcache_clean, &src->flags)) { + test_bit(PG_dcache_clean, &src->flags.f)) { vfrom = kmap_coherent(from, vaddr); copy_page(vto, vfrom); kunmap_coherent(vfrom); @@ -148,7 +148,7 @@ void __update_cache(struct vm_area_struct *vma, if (pfn_valid(pfn)) { struct folio *folio = page_folio(pfn_to_page(pfn)); - int dirty = !test_and_set_bit(PG_dcache_clean, &folio->flags); + int dirty = !test_and_set_bit(PG_dcache_clean, &folio->flags.f); if (dirty) __flush_purge_region(folio_address(folio), folio_size(folio)); @@ -162,7 +162,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr) if (pages_do_alias(addr, vmaddr)) { if (boot_cpu_data.dcache.n_aliases && folio_mapped(folio) && - test_bit(PG_dcache_clean, &folio->flags)) { + test_bit(PG_dcache_clean, &folio->flags.f)) { void *kaddr; kaddr = kmap_coherent(page, vmaddr); diff --git a/arch/sh/mm/kmap.c b/arch/sh/mm/kmap.c index fa50e8f6e7a9..c9f32d5a54b8 100644 --- a/arch/sh/mm/kmap.c +++ b/arch/sh/mm/kmap.c @@ -31,7 +31,7 @@ void *kmap_coherent(struct page *page, unsigned long addr) enum fixed_addresses idx; unsigned long vaddr; - BUG_ON(!test_bit(PG_dcache_clean, &folio->flags)); + BUG_ON(!test_bit(PG_dcache_clean, &folio->flags.f)); preempt_disable(); pagefault_disable(); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 7ed58bf3aaca..df9f7c444c39 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -224,7 +224,7 @@ inline void flush_dcache_folio_impl(struct folio *folio) ((1UL<<ilog2(roundup_pow_of_two(NR_CPUS)))-1UL) #define dcache_dirty_cpu(folio) \ - (((folio)->flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask) + (((folio)->flags.f >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask) static inline void set_dcache_dirty(struct folio *folio, int this_cpu) { @@ -243,7 +243,7 @@ static inline void set_dcache_dirty(struct folio *folio, int this_cpu) "bne,pn %%xcc, 1b\n\t" " nop" : /* no outputs */ - : "r" (mask), "r" (non_cpu_bits), "r" (&folio->flags) + : "r" (mask), "r" (non_cpu_bits), "r" (&folio->flags.f) : "g1", "g7"); } @@ -265,7 +265,7 @@ static inline void clear_dcache_dirty_cpu(struct folio *folio, unsigned long cpu " nop\n" "2:" : /* no outputs */ - : "r" (cpu), "r" (mask), "r" (&folio->flags), + : "r" (cpu), "r" (mask), "r" (&folio->flags.f), "i" (PG_dcache_cpu_mask), "i" (PG_dcache_cpu_shift) : "g1", "g7"); @@ -292,7 +292,7 @@ static void flush_dcache(unsigned long pfn) struct folio *folio = page_folio(page); unsigned long pg_flags; - pg_flags = folio->flags; + pg_flags = folio->flags.f; if (pg_flags & (1UL << PG_dcache_dirty)) { int cpu = ((pg_flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask); @@ -480,7 +480,7 @@ void flush_dcache_folio(struct folio *folio) mapping = folio_flush_mapping(folio); if (mapping && !mapping_mapped(mapping)) { - bool dirty = test_bit(PG_dcache_dirty, &folio->flags); + bool dirty = test_bit(PG_dcache_dirty, &folio->flags.f); if (dirty) { int dirty_cpu = dcache_dirty_cpu(folio); diff --git
a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index c09284302dd3..b68200a0e0c6 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -126,7 +126,7 @@ __setup("debugpat", pat_debug_setup); static inline enum page_cache_mode get_page_memtype(struct page *pg) { - unsigned long pg_flags = pg->flags & _PGMT_MASK; + unsigned long pg_flags = pg->flags.f & _PGMT_MASK; if (pg_flags == _PGMT_WB) return _PAGE_CACHE_MODE_WB; @@ -161,10 +161,10 @@ static inline void set_page_memtype(struct page *pg, break; } - old_flags = READ_ONCE(pg->flags); + old_flags = READ_ONCE(pg->flags.f); do { new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags; - } while (!try_cmpxchg(&pg->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&pg->flags.f, &old_flags, new_flags)); } #else static inline enum page_cache_mode get_page_memtype(struct page *pg) diff --git a/arch/xtensa/mm/cache.c b/arch/xtensa/mm/cache.c index 23be0e7516ce..5354df52d61f 100644 --- a/arch/xtensa/mm/cache.c +++ b/arch/xtensa/mm/cache.c @@ -134,8 +134,8 @@ void flush_dcache_folio(struct folio *folio) */ if (mapping && !mapping_mapped(mapping)) { - if (!test_bit(PG_arch_1, &folio->flags)) - set_bit(PG_arch_1, &folio->flags); + if (!test_bit(PG_arch_1, &folio->flags.f)) + set_bit(PG_arch_1, &folio->flags.f); return; } else { @@ -232,7 +232,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, #if (DCACHE_WAY_SIZE > PAGE_SIZE) - if (!folio_test_reserved(folio) && test_bit(PG_arch_1, &folio->flags)) { + if (!folio_test_reserved(folio) && test_bit(PG_arch_1, &folio->flags.f)) { unsigned long phys = folio_pfn(folio) * PAGE_SIZE; unsigned long tmp; @@ -247,10 +247,10 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, } preempt_enable(); - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); } #else - if (!folio_test_reserved(folio) && !test_bit(PG_arch_1, &folio->flags) + if (!folio_test_reserved(folio) && !test_bit(PG_arch_1, &folio->flags.f) && (vma->vm_flags & VM_EXEC) != 0) { for (i = 0; i < nr; i++) { void *paddr = kmap_local_folio(folio, i * PAGE_SIZE); @@ -258,7 +258,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, __invalidate_icache_page((unsigned long)paddr); kunmap_local(paddr); } - set_bit(PG_arch_1, &folio->flags); + set_bit(PG_arch_1, &folio->flags.f); } #endif } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index e80cd8f2c049..8a89f0aa1d4d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -935,7 +935,7 @@ static int fuse_check_folio(struct folio *folio) { if (folio_mapped(folio) || folio->mapping != NULL || - (folio->flags & PAGE_FLAGS_CHECK_AT_PREP & + (folio->flags.f & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | 1 << PG_lru | diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index fe0faad4892f..0c0a80b3baca 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -40,7 +40,7 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page " "state 0x%lx\n", bh, (unsigned long long)bh->b_blocknr, bh->b_state, - bh->b_folio->mapping, bh->b_folio->flags); + bh->b_folio->mapping, bh->b_folio->flags.f); fs_err(sdp, "AIL glock %u:%llu mapping %p\n", gl->gl_name.ln_type, gl->gl_name.ln_number, gfs2_glock2aspace(gl)); diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index dd3dff95cb24..b697f3c259ef 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -230,7 +230,7 @@ static int jffs2_write_begin(const 
struct kiocb *iocb, goto release_sem; } } - jffs2_dbg(1, "end write_begin(). folio->flags %lx\n", folio->flags); + jffs2_dbg(1, "end write_begin(). folio->flags %lx\n", folio->flags.f); release_sem: mutex_unlock(&c->alloc_sem); @@ -259,7 +259,7 @@ static int jffs2_write_end(const struct kiocb *iocb, jffs2_dbg(1, "%s(): ino #%lu, page at 0x%llx, range %d-%d, flags %lx\n", __func__, inode->i_ino, folio_pos(folio), - start, end, folio->flags); + start, end, folio->flags.f); /* We need to avoid deadlock with page_cache_read() in jffs2_garbage_collect_pass(). So the folio must be diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 806b056d2260..56c4da417b6a 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -167,7 +167,7 @@ void nilfs_folio_bug(struct folio *folio) printk(KERN_CRIT "NILFS_FOLIO_BUG(%p): cnt=%d index#=%llu flags=0x%lx " "mapping=%p ino=%lu\n", folio, folio_ref_count(folio), - (unsigned long long)folio->index, folio->flags, m, ino); + (unsigned long long)folio->index, folio->flags.f, m, ino); head = folio_buffers(folio); if (head) { diff --git a/fs/proc/page.c b/fs/proc/page.c index ba3568e97fd1..771e0b6bc630 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -163,7 +163,7 @@ u64 stable_page_flags(const struct page *page) snapshot_page(&ps, page); folio = &ps.folio_snapshot; - k = folio->flags; + k = folio->flags.f; mapping = (unsigned long)folio->mapping; is_anon = mapping & FOLIO_MAPPING_ANON; @@ -238,7 +238,7 @@ u64 stable_page_flags(const struct page *page) if (u & (1 << KPF_HUGE)) u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); else - u |= kpf_copy_bit(ps.page_snapshot.flags, KPF_HWPOISON, PG_hwpoison); + u |= kpf_copy_bit(ps.page_snapshot.flags.f, KPF_HWPOISON, PG_hwpoison); #endif u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index e75a6cec67be..ca41ce8208c4 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -107,7 +107,7 @@ static int do_readpage(struct folio *folio) size_t offset = 0; dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", - inode->i_ino, folio->index, i_size, folio->flags); + inode->i_ino, folio->index, i_size, folio->flags.f); ubifs_assert(c, !folio_test_checked(folio)); ubifs_assert(c, !folio->private); @@ -600,7 +600,7 @@ static int populate_page(struct ubifs_info *c, struct folio *folio, pgoff_t end_index; dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", - inode->i_ino, folio->index, i_size, folio->flags); + inode->i_ino, folio->index, i_size, folio->flags.f); end_index = (i_size - 1) >> PAGE_SHIFT; if (!i_size || folio->index > end_index) { @@ -988,7 +988,7 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc) int err, len = folio_size(folio); dbg_gen("ino %lu, pg %lu, pg flags %#lx", - inode->i_ino, folio->index, folio->flags); + inode->i_ino, folio->index, folio->flags.f); ubifs_assert(c, folio->private != NULL); /* Is the folio fully outside @i_size? 
(truncate in progress) */ diff --git a/include/linux/mm.h b/include/linux/mm.h index b61e2d4858cf..da562f23f50c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1024,7 +1024,7 @@ static inline unsigned int compound_order(struct page *page) { struct folio *folio = (struct folio *)page; - if (!test_bit(PG_head, &folio->flags)) + if (!test_bit(PG_head, &folio->flags.f)) return 0; return folio_large_order(folio); } @@ -1554,7 +1554,7 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags) */ static inline int page_zone_id(struct page *page) { - return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; + return (page->flags.f >> ZONEID_PGSHIFT) & ZONEID_MASK; } #ifdef NODE_NOT_IN_PAGE_FLAGS @@ -1562,7 +1562,7 @@ int page_to_nid(const struct page *page); #else static inline int page_to_nid(const struct page *page) { - return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK; + return (PF_POISONED_CHECK(page)->flags.f >> NODES_PGSHIFT) & NODES_MASK; } #endif @@ -1637,14 +1637,14 @@ static inline void page_cpupid_reset_last(struct page *page) #else static inline int folio_last_cpupid(struct folio *folio) { - return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; + return (folio->flags.f >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } int folio_xchg_last_cpupid(struct folio *folio, int cpupid); static inline void page_cpupid_reset_last(struct page *page) { - page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; + page->flags.f |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ @@ -1740,7 +1740,7 @@ static inline u8 page_kasan_tag(const struct page *page) u8 tag = KASAN_TAG_KERNEL; if (kasan_enabled()) { - tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; + tag = (page->flags.f >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; tag ^= 0xff; } @@ -1755,12 +1755,12 @@ static inline void page_kasan_tag_set(struct page *page, u8 tag) return; tag ^= 0xff; - old_flags = READ_ONCE(page->flags); + old_flags = READ_ONCE(page->flags.f); do { flags = old_flags; flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; - } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); + } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags))); } static inline void page_kasan_tag_reset(struct page *page) @@ -1804,13 +1804,13 @@ static inline pg_data_t *folio_pgdat(const struct folio *folio) #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { - page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); - page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; + page->flags.f &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); + page->flags.f |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; } static inline unsigned long page_to_section(const struct page *page) { - return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; + return (page->flags.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } #endif @@ -2015,14 +2015,14 @@ static inline bool folio_is_longterm_pinnable(struct folio *folio) static inline void set_page_zone(struct page *page, enum zone_type zone) { - page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); - page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; + page->flags.f &= ~(ZONES_MASK << ZONES_PGSHIFT); + page->flags.f |= (zone & ZONES_MASK) << ZONES_PGSHIFT; } static inline void set_page_node(struct page *page, unsigned long node) { - page->flags &= ~(NODES_MASK << NODES_PGSHIFT); - page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; 
+ page->flags.f &= ~(NODES_MASK << NODES_PGSHIFT); + page->flags.f |= (node & NODES_MASK) << NODES_PGSHIFT; } static inline void set_page_links(struct page *page, enum zone_type zone, @@ -2064,7 +2064,7 @@ static inline long compound_nr(struct page *page) { struct folio *folio = (struct folio *)page; - if (!test_bit(PG_head, &folio->flags)) + if (!test_bit(PG_head, &folio->flags.f)) return 1; return folio_large_nr_pages(folio); } diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 89b518ff097e..150302b4a905 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -143,7 +143,7 @@ static inline int lru_tier_from_refs(int refs, bool workingset) static inline int folio_lru_refs(struct folio *folio) { - unsigned long flags = READ_ONCE(folio->flags); + unsigned long flags = READ_ONCE(folio->flags.f); if (!(flags & BIT(PG_referenced))) return 0; @@ -156,7 +156,7 @@ static inline int folio_lru_refs(struct folio *folio) static inline int folio_lru_gen(struct folio *folio) { - unsigned long flags = READ_ONCE(folio->flags); + unsigned long flags = READ_ONCE(folio->flags.f); return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } @@ -268,7 +268,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ - set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); + set_mask_bits(&folio->flags.f, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ @@ -293,7 +293,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, /* for folio_migrate_flags() */ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; - flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags); + flags = set_mask_bits(&folio->flags.f, LRU_GEN_MASK, flags); gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; lru_gen_update_size(lruvec, folio, gen, -1); @@ -304,9 +304,9 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, static inline void folio_migrate_refs(struct folio *new, struct folio *old) { - unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK; + unsigned long refs = READ_ONCE(old->flags.f) & LRU_REFS_MASK; - set_mask_bits(&new->flags, LRU_REFS_MASK, refs); + set_mask_bits(&new->flags.f, LRU_REFS_MASK, refs); } #else /* !CONFIG_LRU_GEN */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d247da2fdb52..d934a3a5b443 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -34,6 +34,10 @@ struct address_space; struct futex_private_hash; struct mem_cgroup; +typedef struct { + unsigned long f; +} memdesc_flags_t; + /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -72,7 +76,7 @@ struct mem_cgroup; #endif struct page { - unsigned long flags; /* Atomic flags, some possibly + memdesc_flags_t flags; /* Atomic flags, some possibly * updated asynchronously */ /* * Five words (20/40 bytes) are available in this union. 
@@ -383,7 +387,7 @@ struct folio { union { struct { /* public: */ - unsigned long flags; + memdesc_flags_t flags; union { struct list_head lru; /* private: avoid cluttering the output */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9d3ea9085556..990560cd99ee 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1186,7 +1186,7 @@ static inline bool zone_is_empty(struct zone *zone) static inline enum zone_type page_zonenum(const struct page *page) { ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); - return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; + return (page->flags.f >> ZONES_PGSHIFT) & ZONES_MASK; } static inline enum zone_type folio_zonenum(const struct folio *folio) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8d3fa3a91ce4..d53a86e68c89 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -217,7 +217,7 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page * cold cacheline in some cases. */ if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && - test_bit(PG_head, &page->flags)) { + test_bit(PG_head, &page->flags.f)) { /* * We can safely access the field of the @page[1] with PG_head * because the @page is a compound page composed with at least @@ -325,14 +325,14 @@ static __always_inline int PageTail(const struct page *page) static __always_inline int PageCompound(const struct page *page) { - return test_bit(PG_head, &page->flags) || + return test_bit(PG_head, &page->flags.f) || READ_ONCE(page->compound_head) & 1; } #define PAGE_POISON_PATTERN -1l static inline int PagePoisoned(const struct page *page) { - return READ_ONCE(page->flags) == PAGE_POISON_PATTERN; + return READ_ONCE(page->flags.f) == PAGE_POISON_PATTERN; } #ifdef CONFIG_DEBUG_VM @@ -349,8 +349,8 @@ static const unsigned long *const_folio_flags(const struct folio *folio, const struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); - VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); - return &page[n].flags; + VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); + return &page[n].flags.f; } static unsigned long *folio_flags(struct folio *folio, unsigned n) @@ -358,8 +358,8 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); - VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); - return &page[n].flags; + VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); + return &page[n].flags.f; } /* @@ -449,37 +449,37 @@ FOLIO_CLEAR_FLAG(name, page) #define TESTPAGEFLAG(uname, lname, policy) \ FOLIO_TEST_FLAG(lname, FOLIO_##policy) \ static __always_inline int Page##uname(const struct page *page) \ -{ return test_bit(PG_##lname, &policy(page, 0)->flags); } +{ return test_bit(PG_##lname, &policy(page, 0)->flags.f); } #define SETPAGEFLAG(uname, lname, policy) \ FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void SetPage##uname(struct page *page) \ -{ set_bit(PG_##lname, &policy(page, 1)->flags); } +{ set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define CLEARPAGEFLAG(uname, lname, policy) \ FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void ClearPage##uname(struct page *page) \ -{ clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define __SETPAGEFLAG(uname, lname, policy) \ __FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static 
__always_inline void __SetPage##uname(struct page *page) \ -{ __set_bit(PG_##lname, &policy(page, 1)->flags); } +{ __set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define __CLEARPAGEFLAG(uname, lname, policy) \ __FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void __ClearPage##uname(struct page *page) \ -{ __clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ __clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTSETFLAG(uname, lname, policy) \ FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestSetPage##uname(struct page *page) \ -{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } +{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTCLEARFLAG(uname, lname, policy) \ FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestClearPage##uname(struct page *page) \ -{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ @@ -846,7 +846,7 @@ static __always_inline bool folio_test_head(const struct folio *folio) static __always_inline int PageHead(const struct page *page) { PF_POISONED_CHECK(page); - return test_bit(PG_head, &page->flags) && !page_is_fake_head(page); + return test_bit(PG_head, &page->flags.f) && !page_is_fake_head(page); } __SETPAGEFLAG(Head, head, PF_ANY) @@ -1170,28 +1170,28 @@ static __always_inline int PageAnonExclusive(const struct page *page) */ if (PageHuge(page)) page = compound_head(page); - return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); + return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void SetPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); - set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); + set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); - clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); + clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void __ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); - __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); + __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } #ifdef CONFIG_MMU @@ -1241,7 +1241,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) */ static inline int folio_has_private(const struct folio *folio) { - return !!(folio->flags & PAGE_FLAGS_PRIVATE); + return !!(folio->flags.f & PAGE_FLAGS_PRIVATE); } #undef PF_ANY diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 8a7f4f802c57..38a82d65e58e 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -107,7 +107,8 @@ static inline bool get_page_tag_ref(struct page *page, union codetag_ref *ref, if (static_key_enabled(&mem_profiling_compressed)) { pgalloc_tag_idx idx; - idx = (page->flags >> alloc_tag_ref_offs) & alloc_tag_ref_mask; + idx = (page->flags.f >> alloc_tag_ref_offs) & + alloc_tag_ref_mask; idx_to_ref(idx, ref); handle->page = page; } else { @@ -149,11 +150,11 @@ static inline void update_page_tag_ref(union pgtag_ref_handle 
handle, union code idx = (unsigned long)ref_to_idx(ref); idx = (idx & alloc_tag_ref_mask) << alloc_tag_ref_offs; do { - old_flags = READ_ONCE(page->flags); + old_flags = READ_ONCE(page->flags.f); flags = old_flags; flags &= ~(alloc_tag_ref_mask << alloc_tag_ref_offs); flags |= idx; - } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); + } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags))); } else { if (WARN_ON(!handle.ref || !ref)) return; diff --git a/include/trace/events/page_ref.h b/include/trace/events/page_ref.h index fe33a255b7d0..ea6b5c4baf3d 100644 --- a/include/trace/events/page_ref.h +++ b/include/trace/events/page_ref.h @@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_template, TP_fast_assign( __entry->pfn = page_to_pfn(page); - __entry->flags = page->flags; + __entry->flags = page->flags.f; __entry->count = page_ref_count(page); __entry->mapcount = atomic_read(&page->_mapcount); __entry->mapping = page->mapping; @@ -77,7 +77,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_and_test_template, TP_fast_assign( __entry->pfn = page_to_pfn(page); - __entry->flags = page->flags; + __entry->flags = page->flags.f; __entry->count = page_ref_count(page); __entry->mapcount = atomic_read(&page->_mapcount); __entry->mapping = page->mapping; diff --git a/mm/filemap.c b/mm/filemap.c index 1a388b11cfa9..f3a6c24897f4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1140,10 +1140,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, */ flags = wait->flags; if (flags & WQ_FLAG_EXCLUSIVE) { - if (test_bit(key->bit_nr, &key->folio->flags)) + if (test_bit(key->bit_nr, &key->folio->flags.f)) return -1; if (flags & WQ_FLAG_CUSTOM) { - if (test_and_set_bit(key->bit_nr, &key->folio->flags)) + if (test_and_set_bit(key->bit_nr, &key->folio->flags.f)) return -1; flags |= WQ_FLAG_DONE; } @@ -1226,9 +1226,9 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, struct wait_queue_entry *wait) { if (wait->flags & WQ_FLAG_EXCLUSIVE) { - if (test_and_set_bit(bit_nr, &folio->flags)) + if (test_and_set_bit(bit_nr, &folio->flags.f)) return false; - } else if (test_bit(bit_nr, &folio->flags)) + } else if (test_bit(bit_nr, &folio->flags.f)) return false; wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d89992b65acc..aac5f0a2cb54 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3303,8 +3303,8 @@ static void __split_folio_to_order(struct folio *folio, int old_order, * unreferenced sub-pages of an anonymous THP: we can simply drop * PG_anon_exclusive (-> PG_mappedtodisk) for these here. */ - new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - new_folio->flags |= (folio->flags & + new_folio->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; + new_folio->flags.f |= (folio->flags.f & ((1L << PG_referenced) | (1L << PG_swapbacked) | (1L << PG_swapcache) | diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fc30ca4804bf..c15ffee7d32b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1707,10 +1707,10 @@ static int identify_page_state(unsigned long pfn, struct page *p, * carried out only if the first check can't determine the page status. 
*/ for (ps = error_states;; ps++) - if ((p->flags & ps->mask) == ps->res) + if ((p->flags.f & ps->mask) == ps->res) break; - page_flags |= (p->flags & (1UL << PG_dirty)); + page_flags |= (p->flags.f & (1UL << PG_dirty)); if (!ps->mask) for (ps = error_states;; ps++) @@ -2137,7 +2137,7 @@ retry: return action_result(pfn, MF_MSG_FREE_HUGE, res); } - page_flags = folio->flags; + page_flags = folio->flags.f; if (!hwpoison_user_mappings(folio, p, pfn, flags)) { folio_unlock(folio); @@ -2398,7 +2398,7 @@ try_again: * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page * status correctly, we save a copy of the page flags at this time. */ - page_flags = folio->flags; + page_flags = folio->flags.f; /* * __munlock_folio() may clear a writeback folio's LRU flag without @@ -2744,13 +2744,13 @@ static int soft_offline_in_use_page(struct page *page) putback_movable_pages(&pagelist); pr_info("%#lx: %s migration failed %ld, type %pGp\n", - pfn, msg_page[huge], ret, &page->flags); + pfn, msg_page[huge], ret, &page->flags.f); if (ret > 0) ret = -EBUSY; } } else { pr_info("%#lx: %s isolation failed, page count %d, type %pGp\n", - pfn, msg_page[huge], page_count(page), &page->flags); + pfn, msg_page[huge], page_count(page), &page->flags.f); ret = -EBUSY; } return ret; diff --git a/mm/mmzone.c b/mm/mmzone.c index f9baa8882fbf..0c8f181d9d50 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -99,14 +99,14 @@ int folio_xchg_last_cpupid(struct folio *folio, int cpupid) unsigned long old_flags, flags; int last_cpupid; - old_flags = READ_ONCE(folio->flags); + old_flags = READ_ONCE(folio->flags.f); do { flags = old_flags; last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; - } while (unlikely(!try_cmpxchg(&folio->flags, &old_flags, flags))); + } while (unlikely(!try_cmpxchg(&folio->flags.f, &old_flags, flags))); return last_cpupid; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2ee21e46f0fb..ca9e6b9633f7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -950,7 +950,7 @@ static inline void __free_one_page(struct page *page, bool to_tail; VM_BUG_ON(!zone_is_initialized(zone)); - VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); + VM_BUG_ON_PAGE(page->flags.f & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); @@ -1043,7 +1043,7 @@ static inline bool page_expected_state(struct page *page, page->memcg_data | #endif page_pool_page_is_pp(page) | - (page->flags & check_flags))) + (page->flags.f & check_flags))) return false; return true; @@ -1059,7 +1059,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _refcount"; - if (unlikely(page->flags & flags)) { + if (unlikely(page->flags.f & flags)) { if (flags == PAGE_FLAGS_CHECK_AT_PREP) bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; else @@ -1358,7 +1358,7 @@ __always_inline bool free_pages_prepare(struct page *page, int i; if (compound) { - page[1].flags &= ~PAGE_FLAGS_SECOND; + page[1].flags.f &= ~PAGE_FLAGS_SECOND; #ifdef NR_PAGES_IN_LARGE_FOLIO folio->_nr_pages = 0; #endif @@ -1372,7 +1372,7 @@ __always_inline bool free_pages_prepare(struct page *page, continue; } } - (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + (page + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; } } if (folio_test_anon(folio)) { @@ -1391,7 +1391,7 @@ 
__always_inline bool free_pages_prepare(struct page *page, } page_cpupid_reset_last(page); - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); page_table_check_free(page, order); pgalloc_tag_sub(page, 1 << order); diff --git a/mm/swap.c b/mm/swap.c index cb164f9ef9e3..6dd22a904b37 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -387,14 +387,14 @@ static void __lru_cache_activate_folio(struct folio *folio) static void lru_gen_inc_refs(struct folio *folio) { - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); if (folio_test_unevictable(folio)) return; /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) { - set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return; } @@ -406,7 +406,7 @@ static void lru_gen_inc_refs(struct folio *folio) } new_flags = old_flags + BIT(LRU_REFS_PGOFF); - } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); } static bool lru_gen_clear_refs(struct folio *folio) @@ -418,7 +418,7 @@ static bool lru_gen_clear_refs(struct folio *folio) if (gen < 0) return true; - set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0); + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS | BIT(PG_workingset), 0); lrugen = &folio_lruvec(folio)->lrugen; /* whether can do without shuffling under the LRU lock */ diff --git a/mm/vmscan.c b/mm/vmscan.c index b9a1cfeb2ddf..e336577c4454 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -888,11 +888,11 @@ static bool lru_gen_set_refs(struct folio *folio) { /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { - set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return false; } - set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset)); + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset)); return true; } #else @@ -3257,13 +3257,13 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) /* promote pages accessed through page tables */ static int folio_update_gen(struct folio *folio, int gen) { - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { - set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return -1; } @@ -3274,7 +3274,7 @@ static int folio_update_gen(struct folio *folio, int gen) new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset); - } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } @@ -3285,7 +3285,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai int type = folio_is_file_lru(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + unsigned long new_flags, old_flags = 
READ_ONCE(folio->flags.f); VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); @@ -3302,7 +3302,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai /* for folio_end_writeback() */ if (reclaiming) new_flags |= BIT(PG_reclaim); - } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); lru_gen_update_size(lruvec, folio, old_gen, new_gen); @@ -4553,7 +4553,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) - set_mask_bits(&folio->flags, LRU_REFS_MASK, 0); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, 0); /* for shrink_folio_list() */ folio_clear_reclaim(folio); @@ -4766,7 +4766,7 @@ retry: /* don't add rejected folios to the oldest generation */ if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type]) - set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active)); + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active)); } spin_lock_irq(&lruvec->lru_lock); diff --git a/mm/workingset.c b/mm/workingset.c index 6e7f4cb1b9a7..68a76a91111f 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -318,7 +318,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) folio_set_workingset(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); } else - set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); unlock: rcu_read_unlock(); } -- cgit v1.2.3 From 56d578c1300f7efe9605b75714173dd3fda16fe2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:52 +0100 Subject: mm: convert page_to_section() to memdesc_section() Pass in the memdesc_flags_t instead of a pointer to the page. This will allow us to remove a few conversions to struct page in upcoming patches. 
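The wrapper pattern is easiest to see in isolation. Below is a minimal, self-contained userspace sketch of the memdesc_flags_t idea; the shift and mask values are invented for illustration and do not match the kernel's real page-flags layout.

#include <stdio.h>

/*
 * Userspace model of the memdesc_flags_t pattern: wrap the flags word
 * in a one-member struct so accessors take "a flags word" by value
 * rather than a pointer to some page-like object.
 */
typedef struct { unsigned long f; } memdesc_flags_t;

#define DEMO_SECTIONS_PGSHIFT	56	/* made-up placement for this demo */
#define DEMO_SECTIONS_MASK	0xffUL

static inline unsigned long demo_memdesc_section(memdesc_flags_t mdf)
{
	return (mdf.f >> DEMO_SECTIONS_PGSHIFT) & DEMO_SECTIONS_MASK;
}

struct page  { memdesc_flags_t flags; };
struct folio { memdesc_flags_t flags; };	/* folio flags mirror page flags */

int main(void)
{
	struct page page = { .flags = { 0x2aUL << DEMO_SECTIONS_PGSHIFT } };
	struct folio folio = { .flags = page.flags };

	/* Either object's flags word feeds the accessor directly;
	 * no page<->folio pointer conversion is needed. */
	printf("section via page:  %lu\n", demo_memdesc_section(page.flags));
	printf("section via folio: %lu\n", demo_memdesc_section(folio.flags));
	return 0;
}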
Link: https://lkml.kernel.org/r/20250805172307.1302730-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/asm-generic/memory_model.h | 2 +- include/linux/mm.h | 4 ++-- mm/sparse.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index 74d0077cc5fa..efa6610acbc7 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -53,7 +53,7 @@ static inline int pfn_valid(unsigned long pfn) */ #define __page_to_pfn(pg) \ ({ const struct page *__pg = (pg); \ - int __sec = page_to_section(__pg); \ + int __sec = memdesc_section(__pg->flags); \ (unsigned long)(__pg - __section_mem_map_addr(__nr_to_section(__sec))); \ }) diff --git a/include/linux/mm.h b/include/linux/mm.h index da562f23f50c..82617c4cfa24 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1808,9 +1808,9 @@ static inline void set_page_section(struct page *page, unsigned long section) page->flags.f |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; } -static inline unsigned long page_to_section(const struct page *page) +static inline unsigned long memdesc_section(memdesc_flags_t mdf) { - return (page->flags.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK; + return (mdf.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } #endif diff --git a/mm/sparse.c b/mm/sparse.c index e6075b622407..7cb42cbfc7f9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -45,7 +45,7 @@ static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; int page_to_nid(const struct page *page) { - return section_to_node_table[page_to_section(page)]; + return section_to_node_table[memdesc_section(page->flags)]; } EXPORT_SYMBOL(page_to_nid); -- cgit v1.2.3 From eb00fdd84ddabd6948d26595bb5e8c1302220d37 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:53 +0100 Subject: mm: introduce memdesc_nid() Remove a conversion from folio to page by passing the folio->flags (which are a copy of the page->flags) to the new memdesc_nid() function. 
Link: https://lkml.kernel.org/r/20250805172307.1302730-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/mm.h | 21 +++++++++++++-------- mm/sparse.c | 6 +++--- 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 82617c4cfa24..00c8a54127d3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1558,17 +1558,22 @@ static inline int page_zone_id(struct page *page) } #ifdef NODE_NOT_IN_PAGE_FLAGS -int page_to_nid(const struct page *page); +int memdesc_nid(memdesc_flags_t mdf); #else -static inline int page_to_nid(const struct page *page) +static inline int memdesc_nid(memdesc_flags_t mdf) { - return (PF_POISONED_CHECK(page)->flags.f >> NODES_PGSHIFT) & NODES_MASK; + return (mdf.f >> NODES_PGSHIFT) & NODES_MASK; } #endif +static inline int page_to_nid(const struct page *page) +{ + return memdesc_nid(PF_POISONED_CHECK(page)->flags); +} + static inline int folio_nid(const struct folio *folio) { - return page_to_nid(&folio->page); + return memdesc_nid(folio->flags); } #ifdef CONFIG_NUMA_BALANCING @@ -1791,14 +1796,14 @@ static inline pg_data_t *page_pgdat(const struct page *page) return NODE_DATA(page_to_nid(page)); } -static inline struct zone *folio_zone(const struct folio *folio) +static inline pg_data_t *folio_pgdat(const struct folio *folio) { - return page_zone(&folio->page); + return NODE_DATA(folio_nid(folio)); } -static inline pg_data_t *folio_pgdat(const struct folio *folio) +static inline struct zone *folio_zone(const struct folio *folio) { - return page_pgdat(&folio->page); + return &folio_pgdat(folio)->node_zones[folio_zonenum(folio)]; } #ifdef SECTION_IN_PAGE_FLAGS diff --git a/mm/sparse.c b/mm/sparse.c index 7cb42cbfc7f9..17c50a6415c2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -43,11 +43,11 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; #endif -int page_to_nid(const struct page *page) +int memdesc_nid(memdesc_flags_t mdf) { - return section_to_node_table[memdesc_section(page->flags)]; + return section_to_node_table[memdesc_section(mdf)]; } -EXPORT_SYMBOL(page_to_nid); +EXPORT_SYMBOL(memdesc_nid); static void set_section_nid(unsigned long section_nr, int nid) { -- cgit v1.2.3 From 4aff03fbe508780394039053bebfc4f4800b286e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:54 +0100 Subject: mm: introduce memdesc_zonenum() Remove a conversion from folio to page by passing the folio->flags (which are a copy of the page->flags) to the new memdesc_zonenum() function. 
Link: https://lkml.kernel.org/r/20250805172307.1302730-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 990560cd99ee..80a3b6642603 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1183,15 +1183,20 @@ static inline bool zone_is_empty(struct zone *zone) #define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) +static inline enum zone_type memdesc_zonenum(memdesc_flags_t flags) +{ + ASSERT_EXCLUSIVE_BITS(flags.f, ZONES_MASK << ZONES_PGSHIFT); + return (flags.f >> ZONES_PGSHIFT) & ZONES_MASK; +} + static inline enum zone_type page_zonenum(const struct page *page) { - ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); - return (page->flags.f >> ZONES_PGSHIFT) & ZONES_MASK; + return memdesc_zonenum(page->flags); } static inline enum zone_type folio_zonenum(const struct folio *folio) { - return page_zonenum(&folio->page); + return memdesc_zonenum(folio->flags); } #ifdef CONFIG_ZONE_DEVICE -- cgit v1.2.3 From 89ef6ad6fa849b780b5a5caae9068261603e1738 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:57 +0100 Subject: mm: introduce memdesc_is_zone_device() Remove the conversion from folio to page in folio_is_zone_device() by introducing memdesc_is_zone_device() which takes a memdesc_flags_t from either a page or a folio. Link: https://lkml.kernel.org/r/20250805172307.1302730-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 80a3b6642603..fe13ad175fed 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1200,14 +1200,14 @@ static inline enum zone_type folio_zonenum(const struct folio *folio) } #ifdef CONFIG_ZONE_DEVICE -static inline bool is_zone_device_page(const struct page *page) +static inline bool memdesc_is_zone_device(memdesc_flags_t mdf) { - return page_zonenum(page) == ZONE_DEVICE; + return memdesc_zonenum(mdf) == ZONE_DEVICE; } static inline struct dev_pagemap *page_pgmap(const struct page *page) { - VM_WARN_ON_ONCE_PAGE(!is_zone_device_page(page), page); + VM_WARN_ON_ONCE_PAGE(!memdesc_is_zone_device(page->flags), page); return page_folio(page)->pgmap; } @@ -1222,9 +1222,9 @@ static inline struct dev_pagemap *page_pgmap(const struct page *page) static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { - if (is_zone_device_page(a) != is_zone_device_page(b)) + if (memdesc_is_zone_device(a->flags) != memdesc_is_zone_device(b->flags)) return false; - if (!is_zone_device_page(a)) + if (!memdesc_is_zone_device(a->flags)) return true; return page_pgmap(a) == page_pgmap(b); } @@ -1232,7 +1232,7 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, extern void memmap_init_zone_device(struct zone *, unsigned long, unsigned long, struct dev_pagemap *); #else -static inline bool is_zone_device_page(const struct page *page) +static inline bool memdesc_is_zone_device(memdesc_flags_t mdf) { return false; } @@ -1247,9 +1247,14 @@ static inline struct dev_pagemap *page_pgmap(const 
struct page *page) } #endif +static inline bool is_zone_device_page(const struct page *page) +{ + return memdesc_is_zone_device(page->flags); +} + static inline bool folio_is_zone_device(const struct folio *folio) { - return is_zone_device_page(&folio->page); + return memdesc_is_zone_device(folio->flags); } static inline bool is_zone_movable_page(const struct page *page) -- cgit v1.2.3 From 7cfe9cafb6adebc13a246bebafcd69cd37add4e6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:58 +0100 Subject: mm: reimplement folio_is_device_private() For callers of folio_is_device_private(), we save a folio->page->folio conversion. Callers of is_device_private_page() simply move the conversion of page->folio from the implementation of page_pgmap() to is_device_private_page(). Link: https://lkml.kernel.org/r/20250805172307.1302730-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memremap.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 4aa151914eab..5d18cb7a70e5 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -157,16 +157,17 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) return 1 << pgmap->vmemmap_shift; } -static inline bool is_device_private_page(const struct page *page) +static inline bool folio_is_device_private(const struct folio *folio) { return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && - is_zone_device_page(page) && - page_pgmap(page)->type == MEMORY_DEVICE_PRIVATE; + folio_is_zone_device(folio) && + folio->pgmap->type == MEMORY_DEVICE_PRIVATE; } -static inline bool folio_is_device_private(const struct folio *folio) +static inline bool is_device_private_page(const struct page *page) { - return is_device_private_page(&folio->page); + return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && + folio_is_device_private(page_folio(page)); } static inline bool is_pci_p2pdma_page(const struct page *page) -- cgit v1.2.3 From bd0dbbb3fd902c7eea7eb166d91bda4530a8de96 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:59 +0100 Subject: mm: reimplement folio_is_device_coherent() For callers of folio_is_device_coherent(), we save a folio->page->folio conversion. Callers of is_device_coherent_page() simply move the conversion of page->folio from the implementation of page_pgmap() to is_device_coherent_page(). 
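The surrounding patches repeat the same folio-first refactoring, so the shape is worth sketching once. In the sketch below, folio_is_foo(), is_foo_page() and MEMORY_DEVICE_FOO are placeholders, not real kernel identifiers; folio_is_zone_device() and page_folio() are the real helpers.

/* Make the folio variant the primary implementation and let the page
 * variant convert once via page_folio(), instead of the folio variant
 * bouncing through &folio->page.
 */
static inline bool folio_is_foo(const struct folio *folio)
{
	return folio_is_zone_device(folio) &&
	       folio->pgmap->type == MEMORY_DEVICE_FOO;
}

static inline bool is_foo_page(const struct page *page)
{
	return folio_is_foo(page_folio(page));
}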
Link: https://lkml.kernel.org/r/20250805172307.1302730-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memremap.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 5d18cb7a70e5..06d29794abe6 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -177,15 +177,15 @@ static inline bool is_pci_p2pdma_page(const struct page *page) page_pgmap(page)->type == MEMORY_DEVICE_PCI_P2PDMA; } -static inline bool is_device_coherent_page(const struct page *page) +static inline bool folio_is_device_coherent(const struct folio *folio) { - return is_zone_device_page(page) && - page_pgmap(page)->type == MEMORY_DEVICE_COHERENT; + return folio_is_zone_device(folio) && + folio->pgmap->type == MEMORY_DEVICE_COHERENT; } -static inline bool folio_is_device_coherent(const struct folio *folio) +static inline bool is_device_coherent_page(const struct page *page) { - return is_device_coherent_page(&folio->page); + return folio_is_device_coherent(page_folio(page)); } static inline bool is_fsdax_page(const struct page *page) -- cgit v1.2.3 From c995ac3aa3747ec2a0373e5f319a22e0cb31d613 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:23:00 +0100 Subject: mm: reimplement folio_is_fsdax() For callers of folio_is_fsdax(), we save a folio->page->folio conversion. Callers of is_fsdax_page() simply move the conversion of page->folio from the implementation of page_pgmap() to is_fsdax_page(). Link: https://lkml.kernel.org/r/20250805172307.1302730-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memremap.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 06d29794abe6..450d4bb6835c 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -188,15 +188,15 @@ static inline bool is_device_coherent_page(const struct page *page) return folio_is_device_coherent(page_folio(page)); } -static inline bool is_fsdax_page(const struct page *page) +static inline bool folio_is_fsdax(const struct folio *folio) { - return is_zone_device_page(page) && - page_pgmap(page)->type == MEMORY_DEVICE_FS_DAX; + return folio_is_zone_device(folio) && + folio->pgmap->type == MEMORY_DEVICE_FS_DAX; } -static inline bool folio_is_fsdax(const struct folio *folio) +static inline bool is_fsdax_page(const struct page *page) { - return is_fsdax_page(&folio->page); + return folio_is_fsdax(page_folio(page)); } #ifdef CONFIG_ZONE_DEVICE -- cgit v1.2.3 From 88df6ab2f34b60837ebdab64b2514f356d5ebb65 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:23:01 +0100 Subject: mm: add folio_is_pci_p2pdma() Reimplement is_pci_p2pdma_page() in terms of folio_is_pci_p2pdma(). Moves the page_folio() call from inside page_pgmap() to is_pci_p2pdma_page(). This removes a page_folio() call from try_grab_folio() which already has a folio and can pass it in. 
Link: https://lkml.kernel.org/r/20250805172307.1302730-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Shakeel Butt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memremap.h | 10 ++++++++-- mm/gup.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 450d4bb6835c..aa1b6aa877a0 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -170,11 +170,17 @@ static inline bool is_device_private_page(const struct page *page) folio_is_device_private(page_folio(page)); } +static inline bool folio_is_pci_p2pdma(const struct folio *folio) +{ + return IS_ENABLED(CONFIG_PCI_P2PDMA) && + folio_is_zone_device(folio) && + folio->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; +} + static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && - is_zone_device_page(page) && - page_pgmap(page)->type == MEMORY_DEVICE_PCI_P2PDMA; + folio_is_pci_p2pdma(page_folio(page)); } static inline bool folio_is_device_coherent(const struct folio *folio) diff --git a/mm/gup.c b/mm/gup.c index 331d22bf7b2d..b2a78f029127 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -148,7 +148,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs, if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) return -ENOMEM; - if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page))) + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && folio_is_pci_p2pdma(folio))) return -EREMOTEIO; if (flags & FOLL_GET) -- cgit v1.2.3 From 7bebb41b96b5a898134b757fda520b7b990a91fa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Aug 2025 08:10:10 +0200 Subject: mm: remove write_cache_pages No users left. Link: https://lkml.kernel.org/r/20250818061017.1526853-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: David Hildenbrand Cc: Kent Overstreet Cc: Konstantin Komarov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/writeback.h | 6 ------ mm/page-writeback.c | 30 ------------------------------ 2 files changed, 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a2848d731a46..2a7e134d03ee 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -360,12 +360,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb); struct folio *writeback_iter(struct address_space *mapping, struct writeback_control *wbc, struct folio *folio, int *error); -typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc, - void *data); - -int write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); void writeback_set_ratelimit(void); void tag_pages_for_writeback(struct address_space *mapping, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3e248d1c3969..7e1e798e7213 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2590,36 +2590,6 @@ done: } EXPORT_SYMBOL_GPL(writeback_iter); -/** - * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 
- * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @writepage: function called for each page - * @data: data passed to writepage function - * - * Return: %0 on success, negative error code otherwise - * - * Note: please use writeback_iter() instead. - */ -int write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) -{ - struct folio *folio = NULL; - int error; - - while ((folio = writeback_iter(mapping, wbc, folio, &error))) { - error = writepage(folio, wbc, data); - if (error == AOP_WRITEPAGE_ACTIVATE) { - folio_unlock(folio); - error = 0; - } - } - - return error; -} -EXPORT_SYMBOL(write_cache_pages); - int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; -- cgit v1.2.3 From ef49b7b39d50b9e4f9d63e64f5d8acafe3c71158 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 26 Aug 2025 15:13:44 +0000 Subject: maple_tree: fix MAPLE_PARENT_RANGE32 and parent pointer docs MAPLE_PARENT_RANGE32 should be 0x02 as a 32 bit node is indicated by the bit pattern 0b010 which is the hex value 0x02. There are no users currently, so there is no associated bug with this wrong value. Fix typo Note -> Node and replace x with b to indicate binary values. Link: https://lkml.kernel.org/r/20250826151344.403286-1-sidhartha.kumar@oracle.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 16 ++++++++-------- lib/maple_tree.c | 12 ++++++------ 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index bafe143b1f78..41e633264e51 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -57,17 +57,17 @@ * MT_FLAGS_ALLOC_RANGE flag. * * Node types: - * 0x??1 = Root - * 0x?00 = 16 bit nodes - * 0x010 = 32 bit nodes - * 0x110 = 64 bit nodes + * 0b??1 = Root + * 0b?00 = 16 bit nodes + * 0b010 = 32 bit nodes + * 0b110 = 64 bit nodes * * Slot size and location in the parent pointer: * type : slot location - * 0x??1 : Root - * 0x?00 : 16 bit values, type in 0-1, slot in 2-6 - * 0x010 : 32 bit values, type in 0-2, slot in 3-6 - * 0x110 : 64 bit values, type in 0-2, slot in 3-6 + * 0b??1 : Root + * 0b?00 : 16 bit values, type in 0-1, slot in 2-6 + * 0b010 : 32 bit values, type in 0-2, slot in 3-6 + * 0b110 : 64 bit values, type in 0-2, slot in 3-6 */ /* diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b4ee2d29d7a9..c57a4615bdff 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -405,11 +405,11 @@ static __always_inline bool mt_is_alloc(struct maple_tree *mt) * a reuse of the last bit in the node type. This is possible by using bit 1 to * indicate if bit 2 is part of the type or the slot. 
* - * Note types: - * 0x??1 = Root - * 0x?00 = 16 bit nodes - * 0x010 = 32 bit nodes - * 0x110 = 64 bit nodes + * Node types: + * 0b??1 = Root + * 0b?00 = 16 bit nodes + * 0b010 = 32 bit nodes + * 0b110 = 64 bit nodes * * Slot size and alignment * 0b??1 : Root @@ -427,7 +427,7 @@ static __always_inline bool mt_is_alloc(struct maple_tree *mt) #define MAPLE_PARENT_16B_SLOT_MASK 0xFC #define MAPLE_PARENT_RANGE64 0x06 -#define MAPLE_PARENT_RANGE32 0x04 +#define MAPLE_PARENT_RANGE32 0x02 #define MAPLE_PARENT_NOT_RANGE16 0x02 /* -- cgit v1.2.3 From cf1dec76ba8a00b20e51d205f3c9f5c45bc96df2 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 21 Aug 2025 14:55:35 -0700 Subject: mm/filemap: add AS_KERNEL_FILE Patch series "introduce kernel file mapped folios", v4. Btrfs currently tracks its metadata pages in the page cache, using a fake inode (fs_info->btree_inode) with offsets corresponding to where the metadata is stored in the filesystem's full logical address space. A consequence of this is that when btrfs uses filemap_add_folio(), this usage is charged to the cgroup of whichever task happens to be running at the time. These folios don't belong to any particular user cgroup, so I don't think it makes much sense for them to be charged in that way. Some negative consequences as a result: - A task can be holding some important btrfs locks, then need to lookup some metadata and go into reclaim, extending the duration it holds that lock for, and unfairly pushing its own reclaim pain onto other cgroups. - If that cgroup goes into reclaim, it might reclaim these folios a different non-reclaiming cgroup might need soon. This is naturally offset by LRU reclaim, but still. We have two options for how to manage such file pages: 1. charge them to the root cgroup. 2. don't charge them to any cgroup at all. 2. breaks the invariant that every mapped page has a cgroup. This is workable, but unnecessarily risky. Therefore, go with 1. A very similar proposal to use the root cgroup was previously made by Qu, where he eventually proposed the idea of setting it per address_space. This makes good sense for the btrfs use case, as the behavior should apply to all use of the address_space, not select allocations. I.e., if someone adds another filemap_add_folio() call using btrfs's btree_inode, we would almost certainly want to account that to the root cgroup as well. This patch (of 3): Add the flag AS_KERNEL_FILE to the address_space to indicate that this mapping's memory is exempt from the usual memcg accounting. 
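As a hedged sketch of how an opt-in could look for such a fake inode (the function is hypothetical; the actual btrfs wiring is a later patch in this series):

#include <linux/pagemap.h>

/*
 * Hypothetical: mark a kernel-internal mapping so folios added to it are
 * charged to the root cgroup rather than the current task's cgroup.
 */
static void hypothetical_mark_kernel_mapping(struct address_space *mapping)
{
	set_bit(AS_KERNEL_FILE, &mapping->flags);
}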
[boris@bur.io: fix CONFIG_MEMCG build for AS_KERNEL_FILE] Link: https://lkml.kernel.org/r/6de59ddeec81b5c294d337c001ba0061631d4ec6.1755816635.git.boris@bur.io Link: https://lore.kernel.org/linux-mm/b5fef5372ae454a7b6da4f2f75c427aeab6a07d6.1727498749.git.wqu@suse.com/ Link: https://lkml.kernel.org/r/f09c4e2c90351d4cb30a1969f7a863b9238bd291.1755812945.git.boris@bur.io Signed-off-by: Boris Burkov Suggested-by: Qu Wenruo Suggested-by: Shakeel Butt Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 ++ include/linux/pagemap.h | 2 ++ mm/filemap.c | 6 ++++++ 3 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9fa3afc90dd5..e693978b2022 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1059,6 +1059,8 @@ extern int mem_cgroup_init(void); #define MEM_CGROUP_ID_SHIFT 0 +#define root_mem_cgroup (NULL) + static inline struct mem_cgroup *folio_memcg(struct folio *folio) { return NULL; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 12a12dae727d..f0dfdfb13cd9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -211,6 +211,8 @@ enum mapping_flags { folio contents */ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9, + AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't + account usage to user cgroups */ /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, AS_FOLIO_ORDER_MIN = 16, diff --git a/mm/filemap.c b/mm/filemap.c index 6e954156bb77..92ea20356f22 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -960,8 +960,14 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, { void *shadow = NULL; int ret; + struct mem_cgroup *tmp; + bool kernel_file = test_bit(AS_KERNEL_FILE, &mapping->flags); + if (kernel_file) + tmp = set_active_memcg(root_mem_cgroup); ret = mem_cgroup_charge(folio, NULL, gfp); + if (kernel_file) + set_active_memcg(tmp); if (ret) return ret; -- cgit v1.2.3 From e3a9ac4e866ea746475b4026819a8c08ec1142e6 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 21 Aug 2025 14:55:36 -0700 Subject: mm: add vmstat for kernel_file pages Kernel file pages are tricky to track because they are indistinguishable from files whose usage is accounted to the root cgroup. To maintain good accounting, introduce a vmstat counter tracking kernel file pages. Confirmed that these work as expected at a high level by mounting a btrfs using AS_KERNEL_FILE for metadata pages, and seeing the counter rise with fs usage then go back to a minimal level after drop_caches and finally down to 0 after unmounting the fs. 
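For in-kernel inspection, a hedged sketch using the existing node-stat accessor (the wrapper is hypothetical; from userspace the counter shows up as nr_kernel_file_pages in /proc/vmstat):

#include <linux/vmstat.h>

/* Hypothetical helper: system-wide total of kernel file pages. */
static unsigned long hypothetical_nr_kernel_file_pages(void)
{
	return global_node_page_state(NR_KERNEL_FILE_PAGES);
}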
Link: https://lkml.kernel.org/r/08ff633e3a005ed5f7691bfd9f58a5df8e474339.1755812945.git.boris@bur.io Signed-off-by: Boris Burkov Suggested-by: Shakeel Butt Acked-by: Shakeel Butt Tested-by: syzbot@syzkaller.appspotmail.com Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Qu Wenruo Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 + mm/filemap.c | 7 +++++++ mm/vmstat.c | 1 + 3 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fe13ad175fed..f3272ef5131b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -259,6 +259,7 @@ enum node_stat_item { NR_HUGETLB, #endif NR_BALLOON_PAGES, + NR_KERNEL_FILE_PAGES, NR_VM_NODE_STAT_ITEMS }; diff --git a/mm/filemap.c b/mm/filemap.c index 92ea20356f22..cd9387b0a5b5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -190,6 +190,9 @@ static void filemap_unaccount_folio(struct address_space *mapping, __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } + if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags)) + mod_node_page_state(folio_pgdat(folio), + NR_KERNEL_FILE_PAGES, -nr); /* * At this point folio must be either written or cleaned by @@ -989,6 +992,10 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, if (!(gfp & __GFP_WRITE) && shadow) workingset_refault(folio, shadow); folio_add_lru(folio); + if (kernel_file) + mod_node_page_state(folio_pgdat(folio), + NR_KERNEL_FILE_PAGES, + folio_nr_pages(folio)); } return ret; } diff --git a/mm/vmstat.c b/mm/vmstat.c index e74f0b2a1021..e522decf6a72 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1290,6 +1290,7 @@ const char * const vmstat_text[] = { [I(NR_HUGETLB)] = "nr_hugetlb", #endif [I(NR_BALLOON_PAGES)] = "nr_balloon_pages", + [I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages", #undef I /* system-wide enum vm_stat_item counters */ -- cgit v1.2.3 From 98c94f1035fc0c82ab008854a165df2c20c0cb6a Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 27 Aug 2025 07:01:05 +0000 Subject: mm/pageblock-flags: remove PB_migratetype_bits/PB_migrate_end enum pageblock_bits defines the meaning of pageblock bits. Currently, PB_migratetype_bits says the lowest 3 bits represent the migratetype, and the definitions of PB_migrate_end and MIGRATETYPE_MASK rely on it through somewhat magical computation. Remove the definition of PB_migratetype_bits/PB_migrate_end. Use PB_migrate_[0|1|2] to represent the lowest bits for the migratetype. Then we can simplify the related definitions. Also, MIGRATETYPE_AND_ISO_MASK is MIGRATETYPE_MASK plus the isolation bit; using MIGRATETYPE_MASK in the definition of MIGRATETYPE_AND_ISO_MASK looks cleaner. No functional change intended.
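The old and new spellings of MIGRATETYPE_MASK denote the same value; a hedged compile-time sketch of the arithmetic (standalone check, not part of the patch):

#include <linux/bits.h>
#include <linux/build_bug.h>

/*
 * Old: (1UL << (PB_migrate_end + 1)) - 1, with PB_migrate_end == 2, is 0x7.
 * New: BIT(PB_migrate_0) | BIT(PB_migrate_1) | BIT(PB_migrate_2) is also 0x7.
 */
static_assert((BIT(0) | BIT(1) | BIT(2)) == ((1UL << 3) - 1));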
Link: https://lkml.kernel.org/r/20250827070105.16864-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Reviewed-by: Zi Yan Cc: Vlastimil Babka Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 12 +++++------- mm/page_alloc.c | 4 ++-- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 6a44be0f39f4..e046278a01fa 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -13,12 +13,11 @@ #include -#define PB_migratetype_bits 3 /* Bit indices that affect a whole block of pages */ enum pageblock_bits { - PB_migrate, - PB_migrate_end = PB_migrate + PB_migratetype_bits - 1, - /* 3 bits required for migrate types */ + PB_migrate_0, + PB_migrate_1, + PB_migrate_2, PB_compact_skip,/* If set the block is skipped by compaction */ #ifdef CONFIG_MEMORY_ISOLATION @@ -37,11 +36,10 @@ enum pageblock_bits { #define NR_PAGEBLOCK_BITS (roundup_pow_of_two(__NR_PAGEBLOCK_BITS)) -#define MIGRATETYPE_MASK ((1UL << (PB_migrate_end + 1)) - 1) +#define MIGRATETYPE_MASK (BIT(PB_migrate_0)|BIT(PB_migrate_1)|BIT(PB_migrate_2)) #ifdef CONFIG_MEMORY_ISOLATION -#define MIGRATETYPE_AND_ISO_MASK \ - (((1UL << (PB_migrate_end + 1)) - 1) | BIT(PB_migrate_isolate)) +#define MIGRATETYPE_AND_ISO_MASK (MIGRATETYPE_MASK | BIT(PB_migrate_isolate)) #else #define MIGRATETYPE_AND_ISO_MASK MIGRATETYPE_MASK #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c524b80a3b72..2df6ee6998ab 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -355,7 +355,7 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit) { - return pb_bit > PB_migrate_end && pb_bit < __NR_PAGEBLOCK_BITS; + return pb_bit >= PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS; } static __always_inline void @@ -370,7 +370,7 @@ get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn, #else BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); #endif - BUILD_BUG_ON(__MIGRATE_TYPE_END >= (1 << PB_migratetype_bits)); + BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK); VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); bitmap = get_pageblock_bitmap(page, pfn); -- cgit v1.2.3 From 09a616cbb371e6b843e536f00e38d6b43d796ac4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 28 Aug 2025 10:12:32 -0700 Subject: mm/damon/core: add damon_ctx->addr_unit Patch series "mm/damon: support ARM32 with LPAE", v3. Previously, DAMON's physical address space monitoring only supported memory ranges below 4GB on LPAE-enabled systems. This was due to the use of 'unsigned long' in 'struct damon_addr_range', which is 32-bit on ARM32 even with LPAE enabled[1]. To add DAMON support for ARM32 with LPAE enabled, a new core layer parameter called 'addr_unit' was introduced[2]. The operations set layer can translate a core layer address to the real address by multiplying the core layer address by the parameter value. Support of the parameter is up to each operations set layer implementation, though. For example, operations set implementations for virtual address space can simply ignore the parameter. Add the support to paddr, which is the DAMON operations set implementation for the physical address space, as we have a clear use case for that.
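The translation itself is a single multiply; a hedged sketch of what an operations set could do with the parameter (the function is hypothetical, for illustration only):

#include <linux/damon.h>

/*
 * Hypothetical: scale a core-layer address to a physical address.
 * E.g. with addr_unit == 4096, core address 0x100000 maps to the 4 GiB mark.
 */
static phys_addr_t hypothetical_core_to_phys(struct damon_ctx *ctx,
					     unsigned long core_addr)
{
	return (phys_addr_t)core_addr * ctx->addr_unit;
}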
This patch (of 11): In some cases, some of the real addresses handled by the underlying operations set cannot be handled by DAMON since it uses only 'unsigned long' as the address type. Using DAMON for physical address space monitoring of 32 bit ARM devices with large physical address extension (LPAE) is one example[1]. Add a parameter named 'addr_unit' to the core layer to help with such cases. DAMON core API callers can set it as the scale factor that will be used by the operations set for translating the core layer's addresses to the real address by multiplying the core layer address by the parameter value. Support of the parameter is up to each operations set layer. The support from the physical address space operations set (paddr) will be added with the following commits. Link: https://lkml.kernel.org/r/20250828171242.59810-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250828171242.59810-2-sj@kernel.org Link: https://lore.kernel.org/20250408075553.959388-1-zuoze1@huawei.com [1] Link: https://lore.kernel.org/all/20250416042551.158131-1-sj@kernel.org/ [2] Signed-off-by: SeongJae Park Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: ze zuo Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 ++- mm/damon/core.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index d01bfee80bd6..6fa52f7495d9 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -746,7 +746,7 @@ struct damon_attrs { * Accesses to other fields must be protected by themselves. * * @ops: Set of monitoring operations for given use cases. - * + * @addr_unit: Scale factor for core to ops address conversion. * @adaptive_targets: Head of monitoring targets (&damon_target) list. * @schemes: Head of schemes (&damos) list. */ @@ -788,6 +788,7 @@ struct damon_ctx { struct mutex kdamond_lock; struct damon_operations ops; + unsigned long addr_unit; struct list_head adaptive_targets; struct list_head schemes; diff --git a/mm/damon/core.c b/mm/damon/core.c index 52ecc3a4426f..e098389a2f73 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -544,6 +544,8 @@ struct damon_ctx *damon_new_ctx(void) ctx->attrs.min_nr_regions = 10; ctx->attrs.max_nr_regions = 1000; + ctx->addr_unit = 1; + INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); @@ -1245,6 +1247,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) return err; } dst->ops = src->ops; + dst->addr_unit = src->addr_unit; return 0; } -- cgit v1.2.3 From d8f867fa0825fb3e358457566d7326d8aab2406a Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Thu, 28 Aug 2025 10:12:42 -0700 Subject: mm/damon: add damon_ctx->min_sz_region Adopting addr_unit would make DAMON_MIN_REGION 'addr_unit * 4096' bytes and cause data alignment issues[1]. Add damon_ctx->min_sz_region to change DAMON_MIN_REGION from a global macro value to a per-context variable.
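A hedged worked example of the alignment this addresses (numbers illustrative; the actual computation is in the sysfs hunk below):

/*
 * With addr_unit == 1024, DAMON_MIN_REGION (4096) real bytes span four
 * core-layer address units, so:
 *
 *	min_sz_region = max(DAMON_MIN_REGION / addr_unit, 1)
 *	              = max(4096 / 1024, 1) = 4
 *
 * i.e. regions stay aligned to 4 core units == 4096 real bytes.
 */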
Link: https://lkml.kernel.org/r/20250828171242.59810-12-sj@kernel.org Link: https://lore.kernel.org/all/527714dd-0e33-43ab-bbbd-d89670ba79e7@huawei.com [1] Signed-off-by: Quanmin Yan Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: ze zuo Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 ++- mm/damon/core.c | 67 +++++++++++++++++++++++++------------------- mm/damon/sysfs.c | 8 ++++-- mm/damon/tests/core-kunit.h | 21 ++++++++------ mm/damon/tests/vaddr-kunit.h | 2 +- mm/damon/vaddr.c | 2 +- 6 files changed, 61 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 6fa52f7495d9..ec8716292c09 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -747,6 +747,7 @@ struct damon_attrs { * * @ops: Set of monitoring operations for given use cases. * @addr_unit: Scale factor for core to ops address conversion. + * @min_sz_region: Minimum region size. * @adaptive_targets: Head of monitoring targets (&damon_target) list. * @schemes: Head of schemes (&damos) list. */ @@ -789,6 +790,7 @@ struct damon_ctx { struct damon_operations ops; unsigned long addr_unit; + unsigned long min_sz_region; struct list_head adaptive_targets; struct list_head schemes; @@ -877,7 +879,7 @@ static inline void damon_insert_region(struct damon_region *r, void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, - unsigned int nr_ranges); + unsigned int nr_ranges, unsigned long min_sz_region); void damon_update_region_access_rate(struct damon_region *r, bool accessed, struct damon_attrs *attrs); diff --git a/mm/damon/core.c b/mm/damon/core.c index e098389a2f73..12f136a121a2 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -201,6 +201,7 @@ static int damon_fill_regions_holes(struct damon_region *first, * @t: the given target. * @ranges: array of new monitoring target ranges. * @nr_ranges: length of @ranges. + * @min_sz_region: minimum region size. * * This function adds new regions to, or modify existing regions of a * monitoring target to fit in specific ranges. @@ -208,7 +209,7 @@ static int damon_fill_regions_holes(struct damon_region *first, * Return: 0 if success, or negative error code otherwise. 
*/ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, - unsigned int nr_ranges) + unsigned int nr_ranges, unsigned long min_sz_region) { struct damon_region *r, *next; unsigned int i; @@ -245,16 +246,16 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, /* no region intersects with this range */ newr = damon_new_region( ALIGN_DOWN(range->start, - DAMON_MIN_REGION), - ALIGN(range->end, DAMON_MIN_REGION)); + min_sz_region), + ALIGN(range->end, min_sz_region)); if (!newr) return -ENOMEM; damon_insert_region(newr, damon_prev_region(r), r, t); } else { /* resize intersecting regions to fit in this range */ first->ar.start = ALIGN_DOWN(range->start, - DAMON_MIN_REGION); - last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); + min_sz_region); + last->ar.end = ALIGN(range->end, min_sz_region); /* fill possible holes in the range */ err = damon_fill_regions_holes(first, last, t); @@ -545,6 +546,7 @@ struct damon_ctx *damon_new_ctx(void) ctx->attrs.max_nr_regions = 1000; ctx->addr_unit = 1; + ctx->min_sz_region = DAMON_MIN_REGION; INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); @@ -1127,8 +1129,8 @@ static struct damon_target *damon_nth_target(int n, struct damon_ctx *ctx) * * If @src has no region, @dst keeps current regions. */ -static int damon_commit_target_regions( - struct damon_target *dst, struct damon_target *src) +static int damon_commit_target_regions(struct damon_target *dst, + struct damon_target *src, unsigned long src_min_sz_region) { struct damon_region *src_region; struct damon_addr_range *ranges; @@ -1145,18 +1147,19 @@ static int damon_commit_target_regions( i = 0; damon_for_each_region(src_region, src) ranges[i++] = src_region->ar; - err = damon_set_regions(dst, ranges, i); + err = damon_set_regions(dst, ranges, i, src_min_sz_region); kfree(ranges); return err; } static int damon_commit_target( struct damon_target *dst, bool dst_has_pid, - struct damon_target *src, bool src_has_pid) + struct damon_target *src, bool src_has_pid, + unsigned long src_min_sz_region) { int err; - err = damon_commit_target_regions(dst, src); + err = damon_commit_target_regions(dst, src, src_min_sz_region); if (err) return err; if (dst_has_pid) @@ -1178,7 +1181,8 @@ static int damon_commit_targets( if (src_target) { err = damon_commit_target( dst_target, damon_target_has_pid(dst), - src_target, damon_target_has_pid(src)); + src_target, damon_target_has_pid(src), + src->min_sz_region); if (err) return err; } else { @@ -1201,7 +1205,8 @@ static int damon_commit_targets( if (!new_target) return -ENOMEM; err = damon_commit_target(new_target, false, - src_target, damon_target_has_pid(src)); + src_target, damon_target_has_pid(src), + src->min_sz_region); if (err) { damon_destroy_target(new_target, NULL); return err; @@ -1248,6 +1253,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) } dst->ops = src->ops; dst->addr_unit = src->addr_unit; + dst->min_sz_region = src->min_sz_region; return 0; } @@ -1280,8 +1286,8 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) if (ctx->attrs.min_nr_regions) sz /= ctx->attrs.min_nr_regions; - if (sz < DAMON_MIN_REGION) - sz = DAMON_MIN_REGION; + if (sz < ctx->min_sz_region) + sz = ctx->min_sz_region; return sz; } @@ -1625,6 +1631,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, * @t: The target of the region. * @rp: The pointer to the region. * @s: The scheme to be applied. + * @min_sz_region: minimum region size. 
* * If a quota of a scheme has exceeded in a quota charge window, the scheme's * action would applied to only a part of the target access pattern fulfilling @@ -1642,7 +1649,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, * Return: true if the region should be entirely skipped, false otherwise. */ static bool damos_skip_charged_region(struct damon_target *t, - struct damon_region **rp, struct damos *s) + struct damon_region **rp, struct damos *s, unsigned long min_sz_region) { struct damon_region *r = *rp; struct damos_quota *quota = &s->quota; @@ -1664,11 +1671,11 @@ static bool damos_skip_charged_region(struct damon_target *t, if (quota->charge_addr_from && r->ar.start < quota->charge_addr_from) { sz_to_skip = ALIGN_DOWN(quota->charge_addr_from - - r->ar.start, DAMON_MIN_REGION); + r->ar.start, min_sz_region); if (!sz_to_skip) { - if (damon_sz_region(r) <= DAMON_MIN_REGION) + if (damon_sz_region(r) <= min_sz_region) return true; - sz_to_skip = DAMON_MIN_REGION; + sz_to_skip = min_sz_region; } damon_split_region_at(t, r, sz_to_skip); r = damon_next_region(r); @@ -1693,7 +1700,8 @@ static void damos_update_stat(struct damos *s, } static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, - struct damon_region *r, struct damos_filter *filter) + struct damon_region *r, struct damos_filter *filter, + unsigned long min_sz_region) { bool matched = false; struct damon_target *ti; @@ -1710,8 +1718,8 @@ static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, matched = target_idx == filter->target_idx; break; case DAMOS_FILTER_TYPE_ADDR: - start = ALIGN_DOWN(filter->addr_range.start, DAMON_MIN_REGION); - end = ALIGN_DOWN(filter->addr_range.end, DAMON_MIN_REGION); + start = ALIGN_DOWN(filter->addr_range.start, min_sz_region); + end = ALIGN_DOWN(filter->addr_range.end, min_sz_region); /* inside the range */ if (start <= r->ar.start && r->ar.end <= end) { @@ -1747,7 +1755,7 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, s->core_filters_allowed = false; damos_for_each_filter(filter, s) { - if (damos_filter_match(ctx, t, r, filter)) { + if (damos_filter_match(ctx, t, r, filter, ctx->min_sz_region)) { if (filter->allow) s->core_filters_allowed = true; return !filter->allow; @@ -1882,7 +1890,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { sz = ALIGN_DOWN(quota->esz - quota->charged_sz, - DAMON_MIN_REGION); + c->min_sz_region); if (!sz) goto update_stat; damon_split_region_at(t, r, sz); @@ -1930,7 +1938,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (quota->esz && quota->charged_sz >= quota->esz) continue; - if (damos_skip_charged_region(t, &r, s)) + if (damos_skip_charged_region(t, &r, s, c->min_sz_region)) continue; if (!damos_valid_target(c, t, r, s)) @@ -2324,7 +2332,8 @@ static void damon_split_region_at(struct damon_target *t, } /* Split every region in the given target into 'nr_subs' regions */ -static void damon_split_regions_of(struct damon_target *t, int nr_subs) +static void damon_split_regions_of(struct damon_target *t, int nr_subs, + unsigned long min_sz_region) { struct damon_region *r, *next; unsigned long sz_region, sz_sub = 0; @@ -2334,13 +2343,13 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs) sz_region = damon_sz_region(r); for (i = 0; i < nr_subs - 1 && - sz_region > 2 * DAMON_MIN_REGION; i++) { + sz_region > 2 * 
min_sz_region; i++) { /* * Randomly select size of left sub-region to be at * least 10 percent and at most 90% of original region */ sz_sub = ALIGN_DOWN(damon_rand(1, 10) * - sz_region / 10, DAMON_MIN_REGION); + sz_region / 10, min_sz_region); /* Do not allow blank region */ if (sz_sub == 0 || sz_sub >= sz_region) continue; @@ -2380,7 +2389,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx) nr_subregions = 3; damon_for_each_target(t, ctx) - damon_split_regions_of(t, nr_subregions); + damon_split_regions_of(t, nr_subregions, ctx->min_sz_region); last_nr_regions = nr_regions; } @@ -2769,7 +2778,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, addr_range.start = *start; addr_range.end = *end; - return damon_set_regions(t, &addr_range, 1); + return damon_set_regions(t, &addr_range, 1, DAMON_MIN_REGION); } /* diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 98bf15d403b2..0ed404c89f80 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1329,7 +1329,8 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, } static int damon_sysfs_set_regions(struct damon_target *t, - struct damon_sysfs_regions *sysfs_regions) + struct damon_sysfs_regions *sysfs_regions, + unsigned long min_sz_region) { struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr, sizeof(*ranges), GFP_KERNEL | __GFP_NOWARN); @@ -1351,7 +1352,7 @@ static int damon_sysfs_set_regions(struct damon_target *t, if (ranges[i - 1].end > ranges[i].start) goto out; } - err = damon_set_regions(t, ranges, sysfs_regions->nr); + err = damon_set_regions(t, ranges, sysfs_regions->nr, min_sz_region); out: kfree(ranges); return err; @@ -1372,7 +1373,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, /* caller will destroy targets */ return -EINVAL; } - return damon_sysfs_set_regions(t, sys_target->regions); + return damon_sysfs_set_regions(t, sys_target->regions, ctx->min_sz_region); } static int damon_sysfs_add_targets(struct damon_ctx *ctx, @@ -1430,6 +1431,7 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, if (err) return err; ctx->addr_unit = sys_ctx->addr_unit; + ctx->min_sz_region = max(DAMON_MIN_REGION / sys_ctx->addr_unit, 1); err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); if (err) return err; diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 5f5dc9db2e90..51369e35298b 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -230,14 +230,14 @@ static void damon_test_split_regions_of(struct kunit *test) t = damon_new_target(); r = damon_new_region(0, 22); damon_add_region(r, t); - damon_split_regions_of(t, 2); + damon_split_regions_of(t, 2, DAMON_MIN_REGION); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); t = damon_new_target(); r = damon_new_region(0, 220); damon_add_region(r, t); - damon_split_regions_of(t, 4); + damon_split_regions_of(t, 4, DAMON_MIN_REGION); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); damon_free_target(t); damon_destroy_ctx(c); @@ -303,7 +303,7 @@ static void damon_test_set_regions(struct kunit *test) damon_add_region(r1, t); damon_add_region(r2, t); - damon_set_regions(t, &range, 1); + damon_set_regions(t, &range, 1, DAMON_MIN_REGION); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3); damon_for_each_region(r, t) { @@ -450,25 +450,29 @@ static void damos_test_filter_out(struct kunit *test) damon_add_region(r, t); /* region in the range */ - KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, + damos_filter_match(NULL, 
t, r, f, DAMON_MIN_REGION)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 2; - KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region after the range */ r->ar.start = DAMON_MIN_REGION * 6; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region started before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 4; - KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); /* filter should have split the region */ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2); @@ -481,7 +485,8 @@ static void damos_test_filter_out(struct kunit *test) /* region started in the range */ r->ar.start = DAMON_MIN_REGION * 2; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); /* filter should have split the region */ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6); diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index d2b37ccf2cc0..fce38dd53cf8 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -141,7 +141,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test, damon_add_region(r, t); } - damon_set_regions(t, three_regions, 3); + damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION); for (i = 0; i < nr_expected / 2; i++) { r = __nth_region_of(t, i); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 66ef9869eafe..8c048f9b129e 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -299,7 +299,7 @@ static void damon_va_update(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { if (damon_va_three_regions(t, three_regions)) continue; - damon_set_regions(t, three_regions, 3); + damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION); } } -- cgit v1.2.3 From 1e332f303ae93ba4d38b480b1bb5a08f833306f6 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 28 Aug 2025 15:03:11 +0200 Subject: pagevec.h: add `const` to pointer parameters of getter functions For improved const-correctness. 
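A hedged illustration of what the const-qualified getters now permit (the helper is hypothetical):

#include <linux/pagevec.h>

/* Hypothetical read-only query over a batch the caller must not modify. */
static bool hypothetical_batch_full(const struct folio_batch *fbatch)
{
	return folio_batch_space(fbatch) == 0;
}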
Link: https://lkml.kernel.org/r/20250828130311.772993-1-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: SeongJae Park Reviewed-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pagevec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 5d3a0cccc6bf..63be5a451627 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -51,12 +51,12 @@ static inline void folio_batch_reinit(struct folio_batch *fbatch) fbatch->i = 0; } -static inline unsigned int folio_batch_count(struct folio_batch *fbatch) +static inline unsigned int folio_batch_count(const struct folio_batch *fbatch) { return fbatch->nr; } -static inline unsigned int folio_batch_space(struct folio_batch *fbatch) +static inline unsigned int folio_batch_space(const struct folio_batch *fbatch) { return PAGEVEC_SIZE - fbatch->nr; } -- cgit v1.2.3 From 39b44c8c73312ac535ffdf7c8ecd37ea07d4ef86 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 28 Aug 2025 10:48:20 +0200 Subject: huge_mm.h: disallow is_huge_zero_folio(NULL) Calling is_huge_zero_folio(NULL) should not be legal - it makes no sense, and a different (theoretical) implementation may dereference the pointer. But currently, lacking any explicit documentation, this call is possible. But if somebody really passes NULL, the function should not return true - this isn't the huge zero folio after all! However, if the `huge_zero_folio` hasn't been allocated yet, it's NULL, and is_huge_zero_folio(NULL) just happens to return true, which is a lie. This weird side effect prevented me from reproducing a kernel crash that occurred when the elements of a folio_batch were NULL - since folios_put_refs() skips huge zero folios, this sometimes causes a crash, but sometimes does not. For debugging, it is better to reveal such bugs reliably and not hide them behind random preconditions like "has the huge zero folio already been created?" To improve detection of such bugs, David Hildenbrand suggested adding a VM_WARN_ON_ONCE(). Link: https://lkml.kernel.org/r/20250828084820.570118-1-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Dev Jain Cc: Kairui Song Cc: Kemeng Shi Cc: Liam Howlett Cc: Mariano Pache Cc: Nhat Pham Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1ac0d06fb3c1..29ef70022da1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -501,6 +501,8 @@ extern unsigned long huge_zero_pfn; static inline bool is_huge_zero_folio(const struct folio *folio) { + VM_WARN_ON_ONCE(!folio); + return READ_ONCE(huge_zero_folio) == folio; } -- cgit v1.2.3 From f367474b5884edbc42661e7fecf784cb131dd25d Mon Sep 17 00:00:00 2001 From: Brian Mak Date: Tue, 5 Aug 2025 14:15:27 -0700 Subject: x86/kexec: carry forward the boot DTB on kexec Currently, the kexec_file_load syscall on x86 does not support passing a device tree blob to the new kernel. Some embedded x86 systems use device trees. On these systems, failing to pass a device tree to the new kernel causes a boot failure. 
To add support for this, we copy the behavior of ARM64 and PowerPC and copy the current boot's device tree blob for use in the new kernel. We do this on x86 by passing the device tree blob as a setup_data entry in accordance with the x86 boot protocol. This behavior is gated behind the KEXEC_FILE_FORCE_DTB flag. Link: https://lkml.kernel.org/r/20250805211527.122367-3-makb@juniper.net Signed-off-by: Brian Mak Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Betkov Cc: Dave Young Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Rob Herring Cc: Saravana Kannan Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- arch/x86/kernel/kexec-bzimage64.c | 47 ++++++++++++++++++++++++++++++++++++--- include/linux/kexec.h | 5 ++++- include/uapi/linux/kexec.h | 4 ++++ kernel/kexec_file.c | 1 + 4 files changed, 53 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 24a41f0e0cf1..c3244ac680d1 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include @@ -212,6 +214,28 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, } #endif /* CONFIG_EFI */ +#ifdef CONFIG_OF_FLATTREE +static void setup_dtb(struct boot_params *params, + unsigned long params_load_addr, + unsigned int dtb_setup_data_offset) +{ + struct setup_data *sd = (void *)params + dtb_setup_data_offset; + unsigned long setup_data_phys, dtb_len; + + dtb_len = fdt_totalsize(initial_boot_params); + sd->type = SETUP_DTB; + sd->len = dtb_len; + + /* Carry over current boot DTB with setup_data */ + memcpy(sd->data, initial_boot_params, dtb_len); + + /* Add setup data */ + setup_data_phys = params_load_addr + dtb_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; +} +#endif /* CONFIG_OF_FLATTREE */ + static void setup_ima_state(const struct kimage *image, struct boot_params *params, unsigned long params_load_addr, @@ -336,6 +360,17 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, sizeof(struct efi_setup_data); #endif +#ifdef CONFIG_OF_FLATTREE + if (image->force_dtb && initial_boot_params) { + setup_dtb(params, params_load_addr, setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + + fdt_totalsize(initial_boot_params); + } else { + pr_debug("Not carrying over DTB, force_dtb = %d\n", + image->force_dtb); + } +#endif + if (IS_ENABLED(CONFIG_IMA_KEXEC)) { /* Setup IMA log buffer state */ setup_ima_state(image, params, params_load_addr, @@ -529,6 +564,12 @@ static void *bzImage64_load(struct kimage *image, char *kernel, sizeof(struct setup_data) + RNG_SEED_LENGTH; +#ifdef CONFIG_OF_FLATTREE + if (image->force_dtb && initial_boot_params) + kbuf.bufsz += sizeof(struct setup_data) + + fdt_totalsize(initial_boot_params); +#endif + if (IS_ENABLED(CONFIG_IMA_KEXEC)) kbuf.bufsz += sizeof(struct setup_data) + sizeof(struct ima_setup_data); @@ -537,7 +578,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, kbuf.bufsz += sizeof(struct setup_data) + sizeof(struct kho_data); - params = kzalloc(kbuf.bufsz, GFP_KERNEL); + params = kvzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) return ERR_PTR(-ENOMEM); efi_map_offset = params_cmdline_sz; @@ -647,7 +688,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, return ldata; out_free_params: - kfree(params); + kvfree(params); return ERR_PTR(ret); } @@ -659,7 +700,7 @@ static int 
bzImage64_cleanup(void *loader_data) if (!ldata) return 0; - kfree(ldata->bootparams_buf); + kvfree(ldata->bootparams_buf); ldata->bootparams_buf = NULL; return 0; diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 39fe3e6cd282..ff7e231b0485 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -395,6 +395,9 @@ struct kimage { /* Information for loading purgatory */ struct purgatory_info purgatory_info; + + /* Force carrying over the DTB from the current boot */ + bool force_dtb; #endif #ifdef CONFIG_CRASH_HOTPLUG @@ -461,7 +464,7 @@ bool kexec_load_permitted(int kexec_image_type); /* List of defined/legal kexec file flags */ #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG | \ - KEXEC_FILE_NO_CMA) + KEXEC_FILE_NO_CMA | KEXEC_FILE_FORCE_DTB) /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 8958ebfcff94..55749cb0b81d 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -22,12 +22,16 @@ * KEXEC_FILE_ON_CRASH : Load/unload operation belongs to kdump image. * KEXEC_FILE_NO_INITRAMFS : No initramfs is being loaded. Ignore the initrd * fd field. + * KEXEC_FILE_FORCE_DTB : Force carrying over the current boot's DTB to the new + * kernel on x86. This is already the default behavior on + * some other architectures, like ARM64 and PowerPC. */ #define KEXEC_FILE_UNLOAD 0x00000001 #define KEXEC_FILE_ON_CRASH 0x00000002 #define KEXEC_FILE_NO_INITRAMFS 0x00000004 #define KEXEC_FILE_DEBUG 0x00000008 #define KEXEC_FILE_NO_CMA 0x00000010 +#define KEXEC_FILE_FORCE_DTB 0x00000020 /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 91d46502a817..eb62a9794242 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -255,6 +255,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, } image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); + image->force_dtb = flags & KEXEC_FILE_FORCE_DTB; if (cmdline_len) { image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); -- cgit v1.2.3 From c8a09fc9664f79eeb66cdf4a2a34d5b6a239b727 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 14 Jul 2025 10:17:09 +0200 Subject: ida: remove the ida_simple_xxx() API All users of the ida_simple_xxx() have been converted. In Linux 6.11-rc2, the only callers are in tools/testing/. So it is now time to remove the definition of this old and deprecated ida_simple_get() and ida_simple_remove(). Link: https://lkml.kernel.org/r/aa205f45fef70a9c948b6a98bad06da58e4de776.1752480043.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/idr.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/idr.h b/include/linux/idr.h index 2267902d29a7..789e23e67444 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -334,14 +334,6 @@ static inline void ida_init(struct ida *ida) xa_init_flags(&ida->xa, IDA_INIT_FLAGS); } -/* - * ida_simple_get() and ida_simple_remove() are deprecated. Use - * ida_alloc() and ida_free() instead respectively. 
- */ -#define ida_simple_get(ida, start, end, gfp) \ - ida_alloc_range(ida, start, (end) - 1, gfp) -#define ida_simple_remove(ida, id) ida_free(ida, id) - static inline bool ida_is_empty(const struct ida *ida) { return xa_empty(&ida->xa); } -- cgit v1.2.3 From baa96bcb180e979a821ce3ade87a3d2349b2d640 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 14 Jul 2025 10:17:10 +0200 Subject: nvmem: update a comment related to struct nvmem_config Update a comment to match the function used in nvmem_register(). ida_simple_get() was replaced by ida_alloc() in commit 1eb51d6a4fce ("nvmem: switch to simpler IDA interface") Link: https://lkml.kernel.org/r/27a9dec93a9f79140b11a77df38b1b45bd342e09.1752480043.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/nvmem-provider.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 615a560d9edb..f3b13da78aac 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -103,7 +103,7 @@ struct nvmem_cell_info { * * Note: A default "nvmem" name will be assigned to the device if * no name is specified in its configuration. In such case "" is - * generated with ida_simple_get() and provided id field is ignored. + * generated with ida_alloc() and provided id field is ignored. * * Note: Specifying name and setting id to -1 implies a unique device * whose name is provided as-is (kept unaltered). -- cgit v1.2.3 From 2a8c51bc9391ff3c701f06ae1b678419f843dc1a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 19 Aug 2025 00:55:07 -0700 Subject: list.h: add missing kernel-doc for basic macros kernel-doc for the basic LIST_HEAD() and LIST_HEAD_INIT() macros has been missing forever (i.e., since git). Add them for completeness. Link: https://lkml.kernel.org/r/20250819075507.113639-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Cc: Nicolas Frattaroli Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/list.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index e7e28afd28f8..ca63bdea6c1a 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -20,8 +20,16 @@ * using the generic single-entry routines. */ +/** + * LIST_HEAD_INIT - initialize a &struct list_head's links to point to itself + * @name: name of the list_head + */ #define LIST_HEAD_INIT(name) { &(name), &(name) } +/** + * LIST_HEAD - definition of a &struct list_head with initialization values + * @name: name of the list_head + */ #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) -- cgit v1.2.3 From 2683df6539cbc3f0eeeba11154bc0cbf042a5cee Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 25 Aug 2025 10:57:00 +0800 Subject: panic: add note that 'panic_print' parameter is deprecated Just like for 'panic_print's sysctl interface, add a similar note for the setup of the kernel cmdline parameter and the parameter under /sys/module/kernel/. Also add the __core_param_cb() macro, which makes it possible to add special get/set operations for a kernel parameter. Link: https://lkml.kernel.org/r/20250825025701.81921-4-feng.tang@linux.alibaba.com Signed-off-by: Feng Tang Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Cc: Askar Safin Cc: John Ogness Cc: Jonathan Corbet Cc: Lance Yang Cc: "Paul E.
McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton --- include/linux/moduleparam.h | 13 +++++++++++++ kernel/panic.c | 19 ++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 3a25122d83e2..6907aedc4f74 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -349,6 +349,19 @@ static inline void kernel_param_unlock(struct module *mod) __module_param_call("", name, &param_ops_##type, &var, perm, \ -1, KERNEL_PARAM_FL_UNSAFE) +/** + * __core_param_cb - similar to core_param, but with set/get ops instead of a type. + * @name: the name of the cmdline and sysfs parameter (often the same as var) + * @var: the variable + * @ops: the set & get operations for this parameter. + * @perm: visibility in sysfs + * + * Ideally this should be called 'core_param_cb', but the name has been + * used for module core parameters, so add the '__' prefix + */ +#define __core_param_cb(name, ops, arg, perm) \ + __module_param_call("", name, ops, arg, perm, -1, 0) + #endif /* !MODULE */ /** diff --git a/kernel/panic.c b/kernel/panic.c index 72fcbb5a071b..12a10e17ab4a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -937,12 +937,29 @@ EXPORT_SYMBOL(__stack_chk_fail); #endif core_param(panic, panic_timeout, int, 0644); -core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); core_param(panic_console_replay, panic_console_replay, bool, 0644); +static int panic_print_set(const char *val, const struct kernel_param *kp) +{ + pr_info_once("Kernel: 'panic_print' parameter will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + return param_set_ulong(val, kp); +} + +static int panic_print_get(char *val, const struct kernel_param *kp) +{ + pr_info_once("Kernel: 'panic_print' parameter will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + return param_get_ulong(val, kp); +} + +static const struct kernel_param_ops panic_print_ops = { + .set = panic_print_set, + .get = panic_print_get, +}; +__core_param_cb(panic_print, &panic_print_ops, &panic_print, 0644); + static int __init oops_setup(char *s) { if (!s) -- cgit v1.2.3 From d0d9c7235548f1d772f1e48c9d5742c65d81c705 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:29 +0800 Subject: panic: introduce helper functions for panic state Patch series "panic: introduce panic status function family", v2. This series introduces a family of helper functions to manage panic state and updates existing code to use them. Before this series, panic state helpers were scattered and inconsistent. For example, panic_in_progress() was defined in printk/printk.c, not in panic.c or panic.h. As a result, developers had to look in unexpected places to understand or re-use panic state logic. Other checks were open-coded, duplicating logic across panic, crash, and watchdog paths. The new helpers centralize the functionality in panic.c/panic.h: - panic_try_start() - panic_reset() - panic_in_progress() - panic_on_this_cpu() - panic_on_other_cpu() Patches 1–8 add the helpers and convert panic/crash and printk/nbcon code to use them.
Patch 9 fixes a bug in the watchdog subsystem by skipping checks when a panic is in progress, avoiding interference with the panic CPU. Together, this makes panic state handling simpler, more discoverable, and more robust. This patch (of 9): This patch introduces five new helper functions to abstract the management of the panic_cpu variable. These functions will be used in subsequent patches to refactor existing code. The direct use of panic_cpu can be error-prone and ambiguous, as it requires manual checks to determine which CPU is handling the panic. The new helpers clarify intent: panic_try_start(): Atomically sets the current CPU as the panicking CPU. panic_reset(): Resets panic_cpu to PANIC_CPU_INVALID. panic_in_progress(): Checks if a panic has been triggered. panic_on_this_cpu(): Returns true if the current CPU is the panic originator. panic_on_other_cpu(): Returns true if a panic is on another CPU. This change lays the groundwork for improved code readability and robustness in the panic handling subsystem. Link: https://lkml.kernel.org/r/20250825022947.1596226-1-wangjinchao600@gmail.com Link: https://lkml.kernel.org/r/20250825022947.1596226-2-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- include/linux/panic.h | 6 ++++++ kernel/panic.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/printk/printk.c | 5 ----- 3 files changed, 59 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/panic.h b/include/linux/panic.h index 7be742628c25..6f972a66c13e 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -43,6 +43,12 @@ void abort(void); extern atomic_t panic_cpu; #define PANIC_CPU_INVALID -1 +bool panic_try_start(void); +void panic_reset(void); +bool panic_in_progress(void); +bool panic_on_this_cpu(void); +bool panic_on_other_cpu(void); + /* * Only to be used by arch init code. If the user over-wrote the default * CONFIG_PANIC_TIMEOUT, honor it. diff --git a/kernel/panic.c b/kernel/panic.c index 24bca263f896..010a1bfc4843 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -299,6 +299,59 @@ void __weak crash_smp_send_stop(void) atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); +bool panic_try_start(void) +{ + int old_cpu, this_cpu; + + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. + */ + old_cpu = PANIC_CPU_INVALID; + this_cpu = raw_smp_processor_id(); + + return atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu); +} +EXPORT_SYMBOL(panic_try_start); + +void panic_reset(void) +{ + atomic_set(&panic_cpu, PANIC_CPU_INVALID); +} +EXPORT_SYMBOL(panic_reset); + +bool panic_in_progress(void) +{ + return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); +} +EXPORT_SYMBOL(panic_in_progress); + +/* Return true if a panic is in progress on the current CPU.
*/ +bool panic_on_this_cpu(void) +{ + /* + * We can use raw_smp_processor_id() here because it is impossible for + * the task to be migrated to the panic_cpu, or away from it. If + * panic_cpu has already been set, and we're not currently executing on + * that CPU, then we never will be. + */ + return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); +} +EXPORT_SYMBOL(panic_on_this_cpu); + +/* + * Return true if a panic is in progress on a remote CPU. + * + * On true, the local CPU should immediately release any printing resources + * that may be needed by the panic CPU. + */ +bool panic_on_other_cpu(void) +{ + return (panic_in_progress() && !this_cpu_in_panic()); +} +EXPORT_SYMBOL(panic_on_other_cpu); + /* * A variant of panic() called from NMI context. We return if we've already * panicked on this CPU. If another CPU already panicked, loop in diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 0efbcdda9aab..5fe35f377b79 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -345,11 +345,6 @@ static void __up_console_sem(unsigned long ip) } #define up_console_sem() __up_console_sem(_RET_IP_) -static bool panic_in_progress(void) -{ - return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); -} - /* Return true if a panic is in progress on the current CPU. */ bool this_cpu_in_panic(void) { -- cgit v1.2.3 From c6be36e2997662f423edfa3979a63935873ff648 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:35 +0800 Subject: panic/printk: replace this_cpu_in_panic() with panic_on_this_cpu() The helper this_cpu_in_panic() duplicated logic already provided by panic_on_this_cpu(). Remove this_cpu_in_panic() and switch all users to panic_on_this_cpu(). This simplifies the code and avoids having two helpers for the same check. Link: https://lkml.kernel.org/r/20250825022947.1596226-8-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. 
Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- include/linux/printk.h | 2 -- kernel/panic.c | 2 +- kernel/printk/nbcon.c | 2 +- kernel/printk/printk.c | 15 ++------------- kernel/printk/printk_ringbuffer.c | 2 +- lib/dump_stack.c | 2 +- 6 files changed, 6 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 5d22b803f51e..45c663124c9b 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -330,8 +330,6 @@ static inline bool pr_flush(int timeout_ms, bool reset_on_progress) #endif -bool this_cpu_in_panic(void); - #ifdef CONFIG_SMP extern int __printk_cpu_sync_try_get(void); extern void __printk_cpu_sync_wait(void); diff --git a/kernel/panic.c b/kernel/panic.c index c4ef86fc643f..a8b1bf60e09f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -348,7 +348,7 @@ EXPORT_SYMBOL(panic_on_this_cpu); */ bool panic_on_other_cpu(void) { - return (panic_in_progress() && !this_cpu_in_panic()); + return (panic_in_progress() && !panic_on_this_cpu()); } EXPORT_SYMBOL(panic_on_other_cpu); diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index 7490865e2f44..c6d1a4a747e9 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -1394,7 +1394,7 @@ enum nbcon_prio nbcon_get_default_prio(void) { unsigned int *cpu_emergency_nesting; - if (this_cpu_in_panic()) + if (panic_on_this_cpu()) return NBCON_PRIO_PANIC; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5fe35f377b79..faa8b1f0585b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -17,6 +17,7 @@ * 01Mar01 Andrew Morton */ +#include "linux/panic.h" #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include @@ -345,18 +346,6 @@ static void __up_console_sem(unsigned long ip) } #define up_console_sem() __up_console_sem(_RET_IP_) -/* Return true if a panic is in progress on the current CPU. */ -bool this_cpu_in_panic(void) -{ - /* - * We can use raw_smp_processor_id() here because it is impossible for - * the task to be migrated to the panic_cpu, or away from it. If - * panic_cpu has already been set, and we're not currently executing on - * that CPU, then we never will be. - */ - return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); -} - /* * Return true if a panic is in progress on a remote CPU. * @@ -365,7 +354,7 @@ bool this_cpu_in_panic(void) */ bool other_cpu_in_panic(void) { - return (panic_in_progress() && !this_cpu_in_panic()); + return (panic_in_progress() && !panic_on_this_cpu()); } /* diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index d9fb053cff67..e2a1b2d34d2b 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -2143,7 +2143,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * But it would have the sequence number returned * by "prb_next_reserve_seq() - 1". 
*/ - if (this_cpu_in_panic() && + if (panic_on_this_cpu() && (!debug_non_panic_cpus || legacy_allow_panic_sync) && ((*seq + 1) < prb_next_reserve_seq(rb))) { (*seq)++; diff --git a/lib/dump_stack.c b/lib/dump_stack.c index b3a85fe8b673..f0c78b5b5324 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -102,7 +102,7 @@ static void __dump_stack(const char *log_lvl) */ asmlinkage __visible void dump_stack_lvl(const char *log_lvl) { - bool in_panic = this_cpu_in_panic(); + bool in_panic = panic_on_this_cpu(); unsigned long flags; /* -- cgit v1.2.3 From 7b1e502eb17c23fa6459a19bfe5974bffdb95574 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 6 Sep 2025 21:38:57 -0700 Subject: kernel.h: add comments for enum system_states Provide some basic comments about the system_states and what they imply. Also convert the comments to kernel-doc format. Split the enum declaration from the definition of the system_state variable so that kernel-doc notation works cleanly with it. This is picked up by Documentation/driver-api/basics.rst so it does not need further inclusion in the kernel docbooks. Link: https://lkml.kernel.org/r/20250907043857.2941203-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Acked-by: Rafael J. Wysocki # v1 Reviewed-by: Mauro Carvalho Chehab [v5] Cc: "Brown, Len" Cc: Greg Kroah-Hartman Cc: James Bottomley Cc: Jani Nikula Cc: Jonathan Corbet Cc: Mauro Carvalho Chehab Cc: Pavel Machek Signed-off-by: Andrew Morton --- include/linux/kernel.h | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 989315dabb86..5b46924fdff5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -164,11 +164,23 @@ extern int root_mountflags; extern bool early_boot_irqs_disabled; -/* - * Values used for system_state. Ordering of the states must not be changed +/** + * enum system_states - Values used for system_state. + * + * @SYSTEM_BOOTING: %0, no init needed + * @SYSTEM_SCHEDULING: system is ready for scheduling; OK to use RCU + * @SYSTEM_FREEING_INITMEM: system is freeing all of initmem; almost running + * @SYSTEM_RUNNING: system is up and running + * @SYSTEM_HALT: system entered clean system halt state + * @SYSTEM_POWER_OFF: system entered shutdown/clean power off state + * @SYSTEM_RESTART: system entered emergency power off or normal restart + * @SYSTEM_SUSPEND: system entered suspend or hibernate state + * + * Note: + * Ordering of the states must not be changed * as code checks for <, <=, >, >= STATE. */ -extern enum system_states { +enum system_states { SYSTEM_BOOTING, SYSTEM_SCHEDULING, SYSTEM_FREEING_INITMEM, @@ -177,7 +189,8 @@ extern enum system_states { SYSTEM_POWER_OFF, SYSTEM_RESTART, SYSTEM_SUSPEND, -} system_state; +}; +extern enum system_states system_state; /* * General tracing related utility functions - trace_printk(), -- cgit v1.2.3 From d6d5116391857fc78fad9aa42317b36e4ce17b58 Mon Sep 17 00:00:00 2001 From: Evangelos Petrongonas Date: Thu, 21 Aug 2025 17:58:59 +0000 Subject: kexec: introduce is_kho_boot() Patch series "efi: Fix EFI boot with kexec handover (KHO)", v3. This patch series fixes a kernel panic that occurs when booting with both EFI and KHO (Kexec HandOver) enabled. The issue arises because EFI's `reserve_regions()` clears all memory regions with `memblock_remove(0, PHYS_ADDR_MAX)` before rebuilding them from EFI data. 
This destroys KHO scratch regions that were set up early during device tree scanning, causing a panic as the kernel has no valid memory regions for early allocations. The first patch introduces `is_kho_boot()` to allow early boot components to reliably detect whether the kernel was booted via KHO-enabled kexec. The existing `kho_is_enabled()` only checks the command line and doesn't verify whether an actual KHO FDT was passed. The second patch modifies EFI's `reserve_regions()` to selectively remove only non-KHO memory regions when KHO is active, preserving the critical scratch regions while still allowing EFI to rebuild its memory map. This patch (of 3): During early initialisation after a kexec, other components, such as EFI, need to know whether a KHO-enabled kexec was performed. The `kho_is_enabled` function is not enough: in the early stages it only reflects whether the cmdline has KHO enabled, not whether an actual KHO FDT exists. Extend the KHO API with `is_kho_boot()` to provide a way for components to check whether a KHO-enabled kexec was performed. Link: https://lkml.kernel.org/r/cover.1755721529.git.epetron@amazon.de Link: https://lkml.kernel.org/r/7dc6674a76bf6e68cca0222ccff32427699cc02e.1755721529.git.epetron@amazon.de Signed-off-by: Evangelos Petrongonas Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Ard Biesheuvel Cc: Baoquan He Cc: Changyuan Lyu Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 6 ++++++ kernel/kexec_handover.c | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 348844cffb13..559d13a3bc44 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -40,6 +40,7 @@ struct kho_serialization; #ifdef CONFIG_KEXEC_HANDOVER bool kho_is_enabled(void); +bool is_kho_boot(void); int kho_preserve_folio(struct folio *folio); int kho_preserve_phys(phys_addr_t phys, size_t size); @@ -60,6 +61,11 @@ static inline bool kho_is_enabled(void) return false; } +static inline bool is_kho_boot(void) +{ + return false; +} + static inline int kho_preserve_folio(struct folio *folio) { return -EOPNOTSUPP; } diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index ecd1ac210dbd..49a39aee6a8e 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -951,6 +951,26 @@ static const void *kho_get_fdt(void) return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL; } +/** + * is_kho_boot - check if current kernel was booted via KHO-enabled + * kexec + * + * This function checks if the current kernel was loaded through a kexec + * operation with KHO enabled, by verifying that a valid KHO FDT + * was passed. + * + * Note: This function returns reliable results only after + * kho_populate() has been called during early boot. Before that, + * it may return false even if KHO data is present. + * + * Return: true if booted via KHO-enabled kexec, false otherwise + */ +bool is_kho_boot(void) +{ + return !!kho_get_fdt(); +} +EXPORT_SYMBOL_GPL(is_kho_boot); + /** * kho_retrieve_subtree - retrieve a preserved sub FDT by its name. * @name: the name of the sub FDT passed to kho_add_subtree(). -- cgit v1.2.3 From e19ceeb1c0f63e3e15b197c5f34797134b51ba0e Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Thu, 28 Aug 2025 08:35:58 +0000 Subject: platform/chrome: Centralize common cros_ec_device initialization Move the common initialization from protocol device drivers into central cros_ec_device_alloc(). 
This removes duplicated code from each driver's probe function. The buffer sizes are now calculated once, using the maximum possible overhead required by any of the transport protocols, ensuring the allocated buffers are sufficient for all cases. Link: https://lore.kernel.org/r/20250828083601.856083-3-tzungbi@kernel.org Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec.c | 9 +++++++++ drivers/platform/chrome/cros_ec_i2c.c | 5 ----- drivers/platform/chrome/cros_ec_ishtp.c | 4 ---- drivers/platform/chrome/cros_ec_lpc.c | 4 ---- drivers/platform/chrome/cros_ec_rpmsg.c | 4 ---- drivers/platform/chrome/cros_ec_spi.c | 5 ----- drivers/platform/chrome/cros_ec_uart.c | 4 ---- include/linux/platform_data/cros_ec_proto.h | 14 ++++++++++---- 8 files changed, 19 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/chrome/cros_ec.c b/drivers/platform/chrome/cros_ec.c index 25283a148ab9..da049068b6e9 100644 --- a/drivers/platform/chrome/cros_ec.c +++ b/drivers/platform/chrome/cros_ec.c @@ -38,6 +38,15 @@ struct cros_ec_device *cros_ec_device_alloc(struct device *dev) if (!ec_dev) return NULL; + ec_dev->din_size = sizeof(struct ec_host_response) + + sizeof(struct ec_response_get_protocol_info) + + EC_MAX_RESPONSE_OVERHEAD; + ec_dev->dout_size = sizeof(struct ec_host_request) + + sizeof(struct ec_params_rwsig_action) + + EC_MAX_REQUEST_OVERHEAD; + + ec_dev->dev = dev; + return ec_dev; } EXPORT_SYMBOL(cros_ec_device_alloc); diff --git a/drivers/platform/chrome/cros_ec_i2c.c b/drivers/platform/chrome/cros_ec_i2c.c index ee3c5130ec3f..def1144a077e 100644 --- a/drivers/platform/chrome/cros_ec_i2c.c +++ b/drivers/platform/chrome/cros_ec_i2c.c @@ -297,16 +297,11 @@ static int cros_ec_i2c_probe(struct i2c_client *client) return -ENOMEM; i2c_set_clientdata(client, ec_dev); - ec_dev->dev = dev; ec_dev->priv = client; ec_dev->irq = client->irq; ec_dev->cmd_xfer = cros_ec_cmd_xfer_i2c; ec_dev->pkt_xfer = cros_ec_pkt_xfer_i2c; ec_dev->phys_name = client->adapter->name; - ec_dev->din_size = sizeof(struct ec_host_response_i2c) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request_i2c) + - sizeof(struct ec_params_rwsig_action); err = cros_ec_register(ec_dev); if (err) { diff --git a/drivers/platform/chrome/cros_ec_ishtp.c b/drivers/platform/chrome/cros_ec_ishtp.c index c102a796670c..4e74e702c5a2 100644 --- a/drivers/platform/chrome/cros_ec_ishtp.c +++ b/drivers/platform/chrome/cros_ec_ishtp.c @@ -550,14 +550,10 @@ static int cros_ec_dev_init(struct ishtp_cl_data *client_data) client_data->ec_dev = ec_dev; dev->driver_data = ec_dev; - ec_dev->dev = dev; ec_dev->priv = client_data->cros_ish_cl; ec_dev->cmd_xfer = NULL; ec_dev->pkt_xfer = cros_ec_pkt_xfer_ish; ec_dev->phys_name = dev_name(dev); - ec_dev->din_size = sizeof(struct cros_ish_in_msg) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct cros_ish_out_msg) + sizeof(struct ec_params_rwsig_action); return cros_ec_register(ec_dev); } diff --git a/drivers/platform/chrome/cros_ec_lpc.c b/drivers/platform/chrome/cros_ec_lpc.c index 30fa89b81666..78cfff80cdea 100644 --- a/drivers/platform/chrome/cros_ec_lpc.c +++ b/drivers/platform/chrome/cros_ec_lpc.c @@ -642,14 +642,10 @@ static int cros_ec_lpc_probe(struct platform_device *pdev) return -ENOMEM; platform_set_drvdata(pdev, ec_dev); - ec_dev->dev = dev; ec_dev->phys_name = dev_name(dev); ec_dev->cmd_xfer = cros_ec_cmd_xfer_lpc; ec_dev->pkt_xfer = cros_ec_pkt_xfer_lpc; ec_dev->cmd_readmem = 
cros_ec_lpc_readmem; - ec_dev->din_size = sizeof(struct ec_host_response) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request) + sizeof(struct ec_params_rwsig_action); ec_dev->priv = ec_lpc; /* diff --git a/drivers/platform/chrome/cros_ec_rpmsg.c b/drivers/platform/chrome/cros_ec_rpmsg.c index 9ac2b923db6d..09bd9e49464e 100644 --- a/drivers/platform/chrome/cros_ec_rpmsg.c +++ b/drivers/platform/chrome/cros_ec_rpmsg.c @@ -224,14 +224,10 @@ static int cros_ec_rpmsg_probe(struct rpmsg_device *rpdev) if (!ec_rpmsg) return -ENOMEM; - ec_dev->dev = dev; ec_dev->priv = ec_rpmsg; ec_dev->cmd_xfer = cros_ec_cmd_xfer_rpmsg; ec_dev->pkt_xfer = cros_ec_pkt_xfer_rpmsg; ec_dev->phys_name = dev_name(&rpdev->dev); - ec_dev->din_size = sizeof(struct ec_host_response) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request) + sizeof(struct ec_params_rwsig_action); dev_set_drvdata(dev, ec_dev); ec_rpmsg->rpdev = rpdev; diff --git a/drivers/platform/chrome/cros_ec_spi.c b/drivers/platform/chrome/cros_ec_spi.c index c778300a4145..28fa82f8cb07 100644 --- a/drivers/platform/chrome/cros_ec_spi.c +++ b/drivers/platform/chrome/cros_ec_spi.c @@ -757,16 +757,11 @@ static int cros_ec_spi_probe(struct spi_device *spi) cros_ec_spi_dt_probe(ec_spi, dev); spi_set_drvdata(spi, ec_dev); - ec_dev->dev = dev; ec_dev->priv = ec_spi; ec_dev->irq = spi->irq; ec_dev->cmd_xfer = cros_ec_cmd_xfer_spi; ec_dev->pkt_xfer = cros_ec_pkt_xfer_spi; ec_dev->phys_name = dev_name(&ec_spi->spi->dev); - ec_dev->din_size = EC_MSG_PREAMBLE_COUNT + - sizeof(struct ec_host_response) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request) + sizeof(struct ec_params_rwsig_action); ec_spi->last_transfer_ns = ktime_get_ns(); diff --git a/drivers/platform/chrome/cros_ec_uart.c b/drivers/platform/chrome/cros_ec_uart.c index 1a7511b1bbe3..d5b37414ff12 100644 --- a/drivers/platform/chrome/cros_ec_uart.c +++ b/drivers/platform/chrome/cros_ec_uart.c @@ -276,14 +276,10 @@ static int cros_ec_uart_probe(struct serdev_device *serdev) /* Initialize ec_dev for cros_ec */ ec_dev->phys_name = dev_name(dev); - ec_dev->dev = dev; ec_dev->priv = ec_uart; ec_dev->irq = ec_uart->irq; ec_dev->cmd_xfer = NULL; ec_dev->pkt_xfer = cros_ec_uart_pkt_xfer; - ec_dev->din_size = sizeof(struct ec_host_response) + - sizeof(struct ec_response_get_protocol_info); - ec_dev->dout_size = sizeof(struct ec_host_request) + sizeof(struct ec_params_rwsig_action); serdev_device_set_client_ops(serdev, &cros_ec_uart_client_ops); diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 3ec24f445c29..4d96cffbb9e3 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -33,12 +33,18 @@ /* * Max bus-specific overhead incurred by request/responses. - * I2C requires 1 additional byte for requests. - * I2C requires 2 additional bytes for responses. - * SPI requires up to 32 additional bytes for responses. + * + * Request: + * - I2C requires 1 byte (see struct ec_host_request_i2c). + * - ISHTP requires 4 bytes (see struct cros_ish_out_msg). + * + * Response: + * - I2C requires 2 bytes (see struct ec_host_response_i2c). + * - ISHTP requires 4 bytes (see struct cros_ish_in_msg). + * - SPI requires 32 bytes (see EC_MSG_PREAMBLE_COUNT). 
*/ #define EC_PROTO_VERSION_UNKNOWN 0 -#define EC_MAX_REQUEST_OVERHEAD 1 +#define EC_MAX_REQUEST_OVERHEAD 4 #define EC_MAX_RESPONSE_OVERHEAD 32 /* -- cgit v1.2.3 From 56cb557279d70397cefb497e0f06bdd6fd685f8e Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Thu, 28 Aug 2025 08:36:00 +0000 Subject: platform/chrome: cros_ec: Add a flag to track registration state Introduce a `registered` flag to the `struct cros_ec_device` to allow callers to determine if the device has been fully registered and is ready for use. This is a preparatory step to prevent race conditions where other drivers might try to access the device before it is fully registered or after it has been unregistered. Link: https://lore.kernel.org/r/20250828083601.856083-5-tzungbi@kernel.org Signed-off-by: Tzung-Bi Shih --- drivers/platform/chrome/cros_ec.c | 7 +++++++ drivers/platform/chrome/cros_ec_proto.c | 15 +++++++++++++++ include/linux/platform_data/cros_ec_proto.h | 4 ++++ 3 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/drivers/platform/chrome/cros_ec.c b/drivers/platform/chrome/cros_ec.c index 61bcef8741db..1da79e3d215b 100644 --- a/drivers/platform/chrome/cros_ec.c +++ b/drivers/platform/chrome/cros_ec.c @@ -9,6 +9,7 @@ * battery charging and regulator control, firmware update. */ +#include #include #include #include @@ -316,6 +317,9 @@ int cros_ec_register(struct cros_ec_device *ec_dev) goto exit; } + scoped_guard(mutex, &ec_dev->lock) + ec_dev->registered = true; + dev_info(dev, "Chrome EC device registered\n"); /* @@ -343,6 +347,9 @@ EXPORT_SYMBOL(cros_ec_register); */ void cros_ec_unregister(struct cros_ec_device *ec_dev) { + scoped_guard(mutex, &ec_dev->lock) + ec_dev->registered = false; + if (ec_dev->mkbp_event_supported) blocking_notifier_chain_unregister(&ec_dev->event_notifier, &ec_dev->notifier_ready); diff --git a/drivers/platform/chrome/cros_ec_proto.c b/drivers/platform/chrome/cros_ec_proto.c index 3e94a0a82173..1d8d9168ec1a 100644 --- a/drivers/platform/chrome/cros_ec_proto.c +++ b/drivers/platform/chrome/cros_ec_proto.c @@ -3,6 +3,7 @@ // // Copyright (C) 2015 Google, Inc +#include #include #include #include @@ -1153,5 +1154,19 @@ int cros_ec_get_cmd_versions(struct cros_ec_device *ec_dev, u16 cmd) } EXPORT_SYMBOL_GPL(cros_ec_get_cmd_versions); +/** + * cros_ec_device_registered - Return if the ec_dev is registered. + * + * @ec_dev: EC device + * + * Return: true if registered. Otherwise, false. + */ +bool cros_ec_device_registered(struct cros_ec_device *ec_dev) +{ + guard(mutex)(&ec_dev->lock); + return ec_dev->registered; +} +EXPORT_SYMBOL_GPL(cros_ec_device_registered); + MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ChromeOS EC communication protocol helpers"); diff --git a/include/linux/platform_data/cros_ec_proto.h b/include/linux/platform_data/cros_ec_proto.h index 4d96cffbb9e3..de14923720a5 100644 --- a/include/linux/platform_data/cros_ec_proto.h +++ b/include/linux/platform_data/cros_ec_proto.h @@ -128,6 +128,7 @@ struct cros_ec_command { * @dout_size: Size of dout buffer to allocate (zero to use static dout). * @wake_enabled: True if this device can wake the system from sleep. * @suspended: True if this device had been suspended. + * @registered: True if this device had been registered. * @cmd_xfer: Send command to EC and get response. 
* Returns the number of bytes received if the communication * succeeded, but that doesn't mean the EC was happy with the @@ -186,6 +187,7 @@ struct cros_ec_device { int dout_size; bool wake_enabled; bool suspended; + bool registered; int (*cmd_xfer)(struct cros_ec_device *ec, struct cros_ec_command *msg); int (*pkt_xfer)(struct cros_ec_device *ec, @@ -278,6 +280,8 @@ int cros_ec_cmd_readmem(struct cros_ec_device *ec_dev, u8 offset, u8 size, void int cros_ec_get_cmd_versions(struct cros_ec_device *ec_dev, u16 cmd); +bool cros_ec_device_registered(struct cros_ec_device *ec_dev); + /** * cros_ec_get_time_ns() - Return time in ns. * -- cgit v1.2.3 From e68f150bc11d0a05cbe984a4e5c0f72a95cae07d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Mon, 18 Aug 2025 09:46:15 +0300 Subject: memblock: drop for_each_free_mem_pfn_range_in_zone_from() for_each_free_mem_pfn_range_in_zone_from() and its "backend" implementation __next_mem_pfn_range_in_zone() were only used by deferred initialization of the memory map. Remove them as they are not used anymore. Reviewed-by: Wei Yang Reviewed-by: David Hildenbrand Signed-off-by: Mike Rapoport (Microsoft) --- .clang-format | 1 - include/linux/memblock.h | 22 ----------------- mm/memblock.c | 64 ------------------------------------------------ 3 files changed, 87 deletions(-) (limited to 'include/linux') diff --git a/.clang-format b/.clang-format index 48405c54ef27..f371a13b4d19 100644 --- a/.clang-format +++ b/.clang-format @@ -294,7 +294,6 @@ ForEachMacros: - 'for_each_fib6_node_rt_rcu' - 'for_each_fib6_walker_rt' - 'for_each_file_lock' - - 'for_each_free_mem_pfn_range_in_zone_from' - 'for_each_free_mem_range' - 'for_each_free_mem_range_reverse' - 'for_each_func_rsrc' diff --git a/include/linux/memblock.h b/include/linux/memblock.h index fcda8481de9a..221118b5a16e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -324,28 +324,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \ i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid)) -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, - unsigned long *out_spfn, - unsigned long *out_epfn); - -/** - * for_each_free_mem_pfn_range_in_zone_from - iterate through zone specific - * free memblock areas from a given point - * @i: u64 used as loop variable - * @zone: zone in which all of the memory blocks reside - * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL - * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL - * - * Walks over free (memory && !reserved) areas of memblock in a specific - * zone, continuing from current position. Available as soon as memblock is - * initialized. 
- */ -#define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \ - for (; i != U64_MAX; \ - __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) - -#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** * for_each_free_mem_range - iterate through free memblock areas diff --git a/mm/memblock.c b/mm/memblock.c index 117d963e677c..120a501a887a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1445,70 +1445,6 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, return 0; } -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -/** - * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone() - * - * @idx: pointer to u64 loop variable - * @zone: zone in which all of the memory blocks reside - * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL - * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL - * - * This function is meant to be a zone/pfn specific wrapper for the - * for_each_mem_range type iterators. Specifically they are used in the - * deferred memory init routines and as such we were duplicating much of - * this logic throughout the code. So instead of having it in multiple - * locations it seemed like it would make more sense to centralize this to - * one new iterator that does everything they need. - */ -void __init_memblock -__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, - unsigned long *out_spfn, unsigned long *out_epfn) -{ - int zone_nid = zone_to_nid(zone); - phys_addr_t spa, epa; - - __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, - &memblock.memory, &memblock.reserved, - &spa, &epa, NULL); - - while (*idx != U64_MAX) { - unsigned long epfn = PFN_DOWN(epa); - unsigned long spfn = PFN_UP(spa); - - /* - * Verify the end is at least past the start of the zone and - * that we have at least one PFN to initialize. - */ - if (zone->zone_start_pfn < epfn && spfn < epfn) { - /* if we went too far just stop searching */ - if (zone_end_pfn(zone) <= spfn) { - *idx = U64_MAX; - break; - } - - if (out_spfn) - *out_spfn = max(zone->zone_start_pfn, spfn); - if (out_epfn) - *out_epfn = min(zone_end_pfn(zone), epfn); - - return; - } - - __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, - &memblock.memory, &memblock.reserved, - &spa, &epa, NULL); - } - - /* signal end of iteration */ - if (out_spfn) - *out_spfn = ULONG_MAX; - if (out_epfn) - *out_epfn = 0; -} - -#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ - /** * memblock_alloc_range_nid - allocate boot memory block * @size: size of memory block to be allocated in bytes -- cgit v1.2.3 From fdae0ab67d57d480dc61e9fb45678bbdc3786711 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 Sep 2025 12:19:42 +0000 Subject: net: use NUMA drop counters for softnet_data.dropped Hosts under DOS attack can suffer from false sharing in enqueue_to_backlog(): atomic_inc(&sd->dropped). This is because sd->dropped can be touched from many cpus, possibly residing on different NUMA nodes. Generalize the sk_drop_counters infrastructure added in commit c51613fa276f ("net: add sk->sk_drop_counters") and use it to replace softnet_data.dropped with NUMA-friendly softnet_data.drop_counters. This adds 64 bytes per cpu, maybe more in the future if we increase the number of counters (currently 2) per 'struct numa_drop_counters'. 
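As a hedged sketch of the opt-in pattern on the socket side (not part of this patch; "my_proto_sock" and "my_proto_init_sock" are invented names, and the real wiring for raw and UDP sockets happened in c51613fa276f), a protocol embeds a struct numa_drop_counters in its socket and points sk->sk_drop_counters at it:

	struct my_proto_sock {
		struct inet_sock		inet;
		struct numa_drop_counters	drop_counters;
	};

	static void my_proto_init_sock(struct sock *sk)
	{
		struct my_proto_sock *mp = (struct my_proto_sock *)sk;

		/* From here on, sk_drops_add()/sk_drops_read() operate on
		 * the two node-keyed cachelines instead of sk->sk_drops. */
		sk->sk_drop_counters = &mp->drop_counters;
	}
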
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250909121942.1202585-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/ipv6.h | 2 +- include/linux/netdevice.h | 28 +++++++++++++++++++++++++++- include/linux/udp.h | 2 +- include/net/raw.h | 2 +- include/net/sock.h | 37 ++++++++++++------------------------- net/core/dev.c | 2 +- net/core/net-procfs.c | 3 ++- 7 files changed, 45 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 261d02efb615..f43314517396 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -295,7 +295,7 @@ struct raw6_sock { __u32 offset; /* checksum offset */ struct icmp6_filter filter; __u32 ip6mr_table; - struct socket_drop_counters drop_counters; + struct numa_drop_counters drop_counters; struct ipv6_pinfo inet6; }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f3a3b761abfb..f5a840c07cf1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3459,6 +3459,32 @@ static inline bool dev_has_header(const struct net_device *dev) return dev->header_ops && dev->header_ops->create; } +struct numa_drop_counters { + atomic_t drops0 ____cacheline_aligned_in_smp; + atomic_t drops1 ____cacheline_aligned_in_smp; +}; + +static inline int numa_drop_read(const struct numa_drop_counters *ndc) +{ + return atomic_read(&ndc->drops0) + atomic_read(&ndc->drops1); +} + +static inline void numa_drop_add(struct numa_drop_counters *ndc, int val) +{ + int n = numa_node_id() % 2; + + if (n) + atomic_add(val, &ndc->drops1); + else + atomic_add(val, &ndc->drops0); +} + +static inline void numa_drop_reset(struct numa_drop_counters *ndc) +{ + atomic_set(&ndc->drops0, 0); + atomic_set(&ndc->drops1, 0); +} + /* * Incoming packets are placed on per-CPU queues */ @@ -3504,7 +3530,7 @@ struct softnet_data { struct sk_buff_head input_pkt_queue; struct napi_struct backlog; - atomic_t dropped ____cacheline_aligned_in_smp; + struct numa_drop_counters drop_counters; /* Another possibly contended cache line */ spinlock_t defer_lock ____cacheline_aligned_in_smp; diff --git a/include/linux/udp.h b/include/linux/udp.h index 981506be1e15..6ed008ab1665 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -108,7 +108,7 @@ struct udp_sock { * the last UDP socket cacheline. 
*/ struct hlist_node tunnel_list; - struct socket_drop_counters drop_counters; + struct numa_drop_counters drop_counters; }; #define udp_test_bit(nr, sk) \ diff --git a/include/net/raw.h b/include/net/raw.h index d52709139060..66c0ffeada2e 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -81,7 +81,7 @@ struct raw_sock { struct inet_sock inet; struct icmp_filter filter; u32 ipmr_table; - struct socket_drop_counters drop_counters; + struct numa_drop_counters drop_counters; }; #define raw_sk(ptr) container_of_const(ptr, struct raw_sock, inet.sk) diff --git a/include/net/sock.h b/include/net/sock.h index 896bec2d2176..0fd465935334 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -102,11 +102,6 @@ struct net; typedef __u32 __bitwise __portpair; typedef __u64 __bitwise __addrpair; -struct socket_drop_counters { - atomic_t drops0 ____cacheline_aligned_in_smp; - atomic_t drops1 ____cacheline_aligned_in_smp; -}; - /** * struct sock_common - minimal network layer representation of sockets * @skc_daddr: Foreign IPv4 addr @@ -287,7 +282,7 @@ struct sk_filter; * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter - * @sk_drop_counters: optional pointer to socket_drop_counters + * @sk_drop_counters: optional pointer to numa_drop_counters * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner @@ -456,7 +451,7 @@ struct sock { #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif - struct socket_drop_counters *sk_drop_counters; + struct numa_drop_counters *sk_drop_counters; __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); @@ -2698,18 +2693,12 @@ struct sock_skb_cb { static inline void sk_drops_add(struct sock *sk, int segs) { - struct socket_drop_counters *sdc = sk->sk_drop_counters; + struct numa_drop_counters *ndc = sk->sk_drop_counters; - if (sdc) { - int n = numa_node_id() % 2; - - if (n) - atomic_add(segs, &sdc->drops1); - else - atomic_add(segs, &sdc->drops0); - } else { + if (ndc) + numa_drop_add(ndc, segs); + else atomic_add(segs, &sk->sk_drops); - } } static inline void sk_drops_inc(struct sock *sk) @@ -2719,23 +2708,21 @@ static inline void sk_drops_inc(struct sock *sk) static inline int sk_drops_read(const struct sock *sk) { - const struct socket_drop_counters *sdc = sk->sk_drop_counters; + const struct numa_drop_counters *ndc = sk->sk_drop_counters; - if (sdc) { + if (ndc) { DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops)); - return atomic_read(&sdc->drops0) + atomic_read(&sdc->drops1); + return numa_drop_read(ndc); } return atomic_read(&sk->sk_drops); } static inline void sk_drops_reset(struct sock *sk) { - struct socket_drop_counters *sdc = sk->sk_drop_counters; + struct numa_drop_counters *ndc = sk->sk_drop_counters; - if (sdc) { - atomic_set(&sdc->drops0, 0); - atomic_set(&sdc->drops1, 0); - } + if (ndc) + numa_drop_reset(ndc); atomic_set(&sk->sk_drops, 0); } diff --git a/net/core/dev.c b/net/core/dev.c index 1d1650d9ecff..2522d9d8f0e4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5248,7 +5248,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, backlog_unlock_irq_restore(sd, &flags); cpu_backlog_drop: - atomic_inc(&sd->dropped); + numa_drop_add(&sd->drop_counters, 1); bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 4f0f0709a1cb..70e0e9a3b650 100644 
--- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -145,7 +145,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", - READ_ONCE(sd->processed), atomic_read(&sd->dropped), + READ_ONCE(sd->processed), + numa_drop_read(&sd->drop_counters), READ_ONCE(sd->time_squeeze), 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ -- cgit v1.2.3 From ae1c658b33d4bec20c037aebba583a68375d4773 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Thu, 11 Sep 2025 15:08:31 +0200 Subject: net: phy: introduce phy_id_compare_model() PHY ID helper Similar to phy_id_compare_vendor(), introduce the equivalent phy_id_compare_model() helper for the generic PHY ID Model mask. Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Christian Marangi Link: https://patch.msgid.link/20250911130840.23569-1-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 04553419adc3..6f3b25cb7f4e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1308,6 +1308,19 @@ static inline bool phy_id_compare_vendor(u32 id, u32 vendor_mask) return phy_id_compare(id, vendor_mask, PHY_ID_MATCH_VENDOR_MASK); } +/** + * phy_id_compare_model - compare @id with @model mask + * @id: PHY ID + * @model_mask: PHY Model mask + * + * Return: true if the bits from @id match @model using the + * generic PHY Model mask. + */ +static inline bool phy_id_compare_model(u32 id, u32 model_mask) +{ + return phy_id_compare(id, model_mask, PHY_ID_MATCH_MODEL_MASK); +} + /** * phydev_id_compare - compare @id with the PHY's Clause 22 ID * @phydev: the PHY device -- cgit v1.2.3 From 3afaff7a0ce97457c8ab46862f2c06603a89962e Mon Sep 17 00:00:00 2001 From: Akhilesh Patil Date: Mon, 11 Aug 2025 17:42:53 +0530 Subject: include/linux/rv.h: remove redundant include file Remove redundant include to clean up the code. Move all unique include files inside CONFIG_RV as they are only needed when CONFIG_RV is enabled. Arrange include files alphabetically. Fixes: 24cbfe18d55a ("rv: Merge struct rv_monitor_def into struct rv_monitor") [1] Reported-by: kernel test robot Closes: https://lore.kernel.org/r/202507312017.oyD08TL5-lkp@intel.com/ Signed-off-by: Akhilesh Patil Reviewed-by: Gabriele Monaco Link: https://lore.kernel.org/r/aJneRbHGlNFg7lr9@bhairav-test.ee.iitb.ac.in Signed-off-by: Gabriele Monaco --- include/linux/rv.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rv.h b/include/linux/rv.h index 14410a42faef..9520aab34bcb 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -7,16 +7,14 @@ #ifndef _LINUX_RV_H #define _LINUX_RV_H -#include -#include - #define MAX_DA_NAME_LEN 32 #define MAX_DA_RETRY_RACING_EVENTS 3 #ifdef CONFIG_RV +#include #include +#include #include -#include /* * Deterministic automaton per-object variables. -- cgit v1.2.3 From e7c9b66b106989aeb17b167f5bbea9a108d26c0d Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Thu, 17 Jul 2025 17:11:16 +0200 Subject: pwm: Provide a gpio device for waveform drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A PWM is a more general concept than an output-only GPIO. When using duty_length = period_length the PWM looks like an active GPIO, with duty_length = 0 like an inactive GPIO. 
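As a hedged consumer-side sketch of what this enables (not part of this patch; the "enable" GPIO lookup is hypothetical), once the PWM chip also registers a gpio_chip, a plain GPIO consumer can drive a PWM line as a constant output without knowing anything about PWMs:

	struct gpio_desc *desc;

	desc = gpiod_get(dev, "enable", GPIOD_OUT_LOW);
	if (!IS_ERR(desc)) {
		/* 1: duty_length == period_length, constant active level;
		 * 0: duty_length == 0, constant inactive level. */
		gpiod_set_value_cansleep(desc, 1);
		gpiod_put(desc);
	}
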
With the waveform abstraction there is enough control over the configuration to ensure that PWMs that cannot generate a constant signal at both levels error out. The pwm-pca9685 driver already provides a gpio chip. When this driver is converted to the waveform callbacks, the gpio part can just be dropped. Signed-off-by: Uwe Kleine-König Reviewed-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20250717151117.1828585-2-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- drivers/pwm/Kconfig | 9 +++++++ drivers/pwm/core.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pwm.h | 3 +++ 3 files changed, 84 insertions(+) (limited to 'include/linux') diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index f00ce973dddf..2aa24608a8df 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -38,6 +38,15 @@ config PWM_DEBUG It is expected to introduce some runtime overhead and diagnostic output to the kernel log, so only enable while working on a driver. +config PWM_PROVIDE_GPIO + bool "Provide a GPIO chip for each PWM chip" + depends on GPIOLIB + help + Most PWMs can emit both a constant active high and a constant active + low signal and so they can be used as GPIO. Say Y here to let each + PWM chip provide a GPIO chip and so be easily plugged into consumers + that know how to handle GPIOs but not PWMs. + config PWM_AB8500 tristate "AB8500 PWM support" depends on AB8500_CORE && ARCH_U8500 diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 2570ad6a7f59..ea2ccf42e814 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -2391,6 +2391,51 @@ static const struct file_operations pwm_cdev_fileops = { static dev_t pwm_devt; +static int pwm_gpio_request(struct gpio_chip *gc, unsigned int offset) +{ + struct pwm_chip *chip = gpiochip_get_data(gc); + struct pwm_device *pwm; + + pwm = pwm_request_from_chip(chip, offset, "pwm-gpio"); + if (IS_ERR(pwm)) + return PTR_ERR(pwm); + + return 0; +} + +static void pwm_gpio_free(struct gpio_chip *gc, unsigned int offset) +{ + struct pwm_chip *chip = gpiochip_get_data(gc); + + pwm_put(&chip->pwms[offset]); +} + +static int pwm_gpio_get_direction(struct gpio_chip *gc, unsigned int offset) +{ + return GPIO_LINE_DIRECTION_OUT; +} + +static int pwm_gpio_set(struct gpio_chip *gc, unsigned int offset, int value) +{ + struct pwm_chip *chip = gpiochip_get_data(gc); + struct pwm_device *pwm = &chip->pwms[offset]; + int ret; + struct pwm_waveform wf = { + .period_length_ns = 1, + }; + + ret = pwm_round_waveform_might_sleep(pwm, &wf); + if (ret < 0) + return ret; + + if (value) + wf.duty_length_ns = wf.period_length_ns; + else + wf.duty_length_ns = 0; + + return pwm_set_waveform_might_sleep(pwm, &wf, true); +} + /** * __pwmchip_add() - register a new PWM chip * @chip: the PWM chip to add @@ -2457,9 +2502,33 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner) if (ret) goto err_device_add; + if (IS_ENABLED(CONFIG_PWM_PROVIDE_GPIO) && chip->ops->write_waveform) { + struct device *parent = pwmchip_parent(chip); + + chip->gpio = (typeof(chip->gpio)){ + .label = dev_name(parent), + .parent = parent, + .request = pwm_gpio_request, + .free = pwm_gpio_free, + .get_direction = pwm_gpio_get_direction, + .set = pwm_gpio_set, + .base = -1, + .ngpio = chip->npwm, + .can_sleep = true, + }; + + ret = gpiochip_add_data(&chip->gpio, chip); + if (ret) + goto err_gpiochip_add; + } + return 0; +err_gpiochip_add: + + cdev_device_del(&chip->cdev, &chip->dev); err_device_add: + scoped_guard(pwmchip, chip) chip->operational = 
false; @@ -2480,6 +2549,9 @@ EXPORT_SYMBOL_GPL(__pwmchip_add); */ void pwmchip_remove(struct pwm_chip *chip) { + if (IS_ENABLED(CONFIG_PWM_PROVIDE_GPIO) && chip->ops->write_waveform) + gpiochip_remove(&chip->gpio); + pwmchip_sysfs_unexport(chip); scoped_guard(mutex, &pwm_lock) { diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 8cafc483db53..549ac4aaad59 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -321,6 +322,7 @@ struct pwm_ops { * @npwm: number of PWMs controlled by this chip * @of_xlate: request a PWM device given a device tree PWM specifier * @atomic: can the driver's ->apply() be called in atomic context + * @gpio: &struct gpio_chip to operate this PWM chip's lines as GPO * @uses_pwmchip_alloc: signals if pwmchip_allow was used to allocate this chip * @operational: signals if the chip can be used (or is already deregistered) * @nonatomic_lock: mutex for nonatomic chips @@ -340,6 +342,7 @@ struct pwm_chip { bool atomic; /* only used internally by the PWM framework */ + struct gpio_chip gpio; bool uses_pwmchip_alloc; bool operational; union { -- cgit v1.2.3 From 09f37134464cc03baf5cb8eab2d99db27ee73217 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 5 Sep 2025 16:34:00 -0500 Subject: x86,fs/resctrl: Consolidate monitor event descriptions There are currently only three monitor events, all associated with the RDT_RESOURCE_L3 resource. Growing support for additional events will be easier with some restructuring to have a single point in file system code where all attributes of all events are defined. Place all event descriptions into an array mon_event_all[]. Doing this has the beneficial side effect of removing the need for rdt_resource::evt_list. Add resctrl_event_id::QOS_FIRST_EVENT for a lower bound on range checks for event ids and as the starting index to scan mon_event_all[]. Drop the code that builds evt_list and change the two places where the list is scanned to scan mon_event_all[] instead using a new helper macro for_each_mon_event(). Architecture code now informs file system code which events are available with resctrl_enable_mon_event(). 
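To illustrate the intended extensibility (a hedged sketch; QOS_L3_NEW_EVENT_ID and the "mbm_new_bytes" name are invented for this example), a future event would need only a new initializer in the table plus one call from architecture code when the hardware feature is detected:

	/* fs/resctrl/monitor.c: one more initializer in mon_event_all[] */
	[QOS_L3_NEW_EVENT_ID] = {
		.name	= "mbm_new_bytes",
		.evtid	= QOS_L3_NEW_EVENT_ID,
		.rid	= RDT_RESOURCE_L3,
	},

	/* architecture enumeration code, when the feature is present: */
	resctrl_enable_mon_event(QOS_L3_NEW_EVENT_ID);
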
Signed-off-by: Tony Luck Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- arch/x86/kernel/cpu/resctrl/core.c | 12 ++++++-- fs/resctrl/internal.h | 13 ++++++-- fs/resctrl/monitor.c | 63 +++++++++++++++++++------------------- fs/resctrl/rdtgroup.c | 11 ++++--- include/linux/resctrl.h | 4 +-- include/linux/resctrl_types.h | 12 +++++--- 6 files changed, 66 insertions(+), 49 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 187d527ef73b..7fcae25874fe 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -864,12 +864,18 @@ static __init bool get_rdt_mon_resources(void) { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; - if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) { + resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID); rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); - if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) + } + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) { + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + } if (!rdt_mon_features) return false; diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 9a8cf6f11151..7a57366d1abc 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -52,19 +52,26 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) } /** - * struct mon_evt - Entry in the event list of a resource + * struct mon_evt - Properties of a monitor event * @evtid: event id + * @rid: resource id for this event * @name: name of the event * @configurable: true if the event is configurable - * @list: entry in &rdt_resource->evt_list + * @enabled: true if the event is enabled */ struct mon_evt { enum resctrl_event_id evtid; + enum resctrl_res_level rid; char *name; bool configurable; - struct list_head list; + bool enabled; }; +extern struct mon_evt mon_event_all[QOS_NUM_EVENTS]; + +#define for_each_mon_event(mevt) for (mevt = &mon_event_all[QOS_FIRST_EVENT]; \ + mevt < &mon_event_all[QOS_NUM_EVENTS]; mevt++) + /** * struct mon_data - Monitoring details for each event file. * @list: Member of the global @mon_data_kn_priv_list list. diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 7326c28a7908..d5bf3e0334b6 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -842,38 +842,39 @@ out_unlock: mutex_unlock(&rdtgroup_mutex); } -static struct mon_evt llc_occupancy_event = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, -}; - -static struct mon_evt mbm_total_event = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, -}; - -static struct mon_evt mbm_local_event = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, -}; - /* - * Initialize the event list for the resource. - * - * Note that MBM events are also part of RDT_RESOURCE_L3 resource - * because as per the SDM the total and local memory bandwidth - * are enumerated as part of L3 monitoring. + * All available events. Architecture code marks the ones that + * are supported by a system using resctrl_enable_mon_event() + * to set .enabled. 
*/ -static void l3_mon_evt_init(struct rdt_resource *r) +struct mon_evt mon_event_all[QOS_NUM_EVENTS] = { + [QOS_L3_OCCUP_EVENT_ID] = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, + [QOS_L3_MBM_TOTAL_EVENT_ID] = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, + [QOS_L3_MBM_LOCAL_EVENT_ID] = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, + .rid = RDT_RESOURCE_L3, + }, +}; + +void resctrl_enable_mon_event(enum resctrl_event_id eventid) { - INIT_LIST_HEAD(&r->evt_list); + if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS)) + return; + if (mon_event_all[eventid].enabled) { + pr_warn("Duplicate enable for event %d\n", eventid); + return; + } - if (resctrl_arch_is_llc_occupancy_enabled()) - list_add_tail(&llc_occupancy_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_total_enabled()) - list_add_tail(&mbm_total_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_local_enabled()) - list_add_tail(&mbm_local_event.list, &r->evt_list); + mon_event_all[eventid].enabled = true; } /** @@ -900,15 +901,13 @@ int resctrl_mon_resource_init(void) if (ret) return ret; - l3_mon_evt_init(r); - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { - mbm_total_event.configurable = true; + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_total_bytes_config", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { - mbm_local_event.configurable = true; + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].configurable = true; resctrl_file_fflags_init("mbm_local_bytes_config", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 5f0b7cfa1cc2..ab943a5907c5 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1152,7 +1152,9 @@ static int rdt_mon_features_show(struct kernfs_open_file *of, struct rdt_resource *r = rdt_kn_parent_priv(of->kn); struct mon_evt *mevt; - list_for_each_entry(mevt, &r->evt_list, list) { + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled) + continue; seq_printf(seq, "%s\n", mevt->name); if (mevt->configurable) seq_printf(seq, "%s_config\n", mevt->name); @@ -3054,10 +3056,9 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, struct mon_evt *mevt; int ret, domid; - if (WARN_ON(list_empty(&r->evt_list))) - return -EPERM; - - list_for_each_entry(mevt, &r->evt_list, list) { + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled) + continue; domid = do_sum ? d->ci_id : d->hdr.id; priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); if (WARN_ON_ONCE(!priv)) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 6fb4894b8cfd..2944042bd84c 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -269,7 +269,6 @@ enum resctrl_schema_fmt { * @mon_domains: RCU list of all monitor domains for this resource * @name: Name to use in "schemata" file. * @schema_fmt: Which format string and parser is used for this schema. - * @evt_list: List of monitoring events * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth * monitoring events can be configured. 
* @cdp_capable: Is the CDP feature available on this resource @@ -287,7 +286,6 @@ struct rdt_resource { struct list_head mon_domains; char *name; enum resctrl_schema_fmt schema_fmt; - struct list_head evt_list; unsigned int mbm_cfg_mask; bool cdp_capable; }; @@ -372,6 +370,8 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r); u32 resctrl_arch_system_num_rmid_idx(void); int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); +void resctrl_enable_mon_event(enum resctrl_event_id eventid); + bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); /** diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index a25fb9c4070d..2dadbc54e4b3 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -34,11 +34,15 @@ /* Max event bits supported */ #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) -/* - * Event IDs, the values match those used to program IA32_QM_EVTSEL before - * reading IA32_QM_CTR on RDT systems. - */ +/* Event IDs */ enum resctrl_event_id { + /* Must match value of first event below */ + QOS_FIRST_EVENT = 0x01, + + /* + * These values match those used to program IA32_QM_EVTSEL before + * reading IA32_QM_CTR on RDT systems. + */ QOS_L3_OCCUP_EVENT_ID = 0x01, QOS_L3_MBM_TOTAL_EVENT_ID = 0x02, QOS_L3_MBM_LOCAL_EVENT_ID = 0x03, -- cgit v1.2.3 From d257cc2e5c8bb8236cb161360d6c0529fd442712 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 5 Sep 2025 16:34:01 -0500 Subject: x86,fs/resctrl: Replace architecture event enabled checks The resctrl file system now has complete knowledge of the status of every event. So there is no need for per-event function calls to check. Replace each of the resctrl_arch_is_{event}enabled() calls with resctrl_is_mon_event_enabled(QOS_{EVENT}). No functional change. 
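One consequence worth noting (an illustrative sketch, not part of this patch): because the predicate takes the event id as a parameter, callers can iterate over event ids instead of open-coding one helper per event:

	enum resctrl_event_id evt;

	for (evt = QOS_FIRST_EVENT; evt < QOS_NUM_EVENTS; evt++)
		if (resctrl_is_mon_event_enabled(evt))
			pr_debug("monitor event %d enabled\n", evt);
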
Signed-off-by: Tony Luck Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Fenghua Yu Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- arch/x86/include/asm/resctrl.h | 15 --------------- arch/x86/kernel/cpu/resctrl/core.c | 4 ++-- arch/x86/kernel/cpu/resctrl/monitor.c | 4 ++-- fs/resctrl/ctrlmondata.c | 4 ++-- fs/resctrl/monitor.c | 16 +++++++++++----- fs/resctrl/rdtgroup.c | 18 +++++++++--------- include/linux/resctrl.h | 2 ++ 7 files changed, 28 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index feb93b50e990..b1dd5d6b87db 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -84,21 +84,6 @@ static inline void resctrl_arch_disable_mon(void) static_branch_dec_cpuslocked(&rdt_enable_key); } -static inline bool resctrl_arch_is_llc_occupancy_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); -} - -static inline bool resctrl_arch_is_mbm_total_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); -} - -static inline bool resctrl_arch_is_mbm_local_enabled(void) -{ - return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); -} - /* * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 7fcae25874fe..1a319ce9328c 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -402,13 +402,13 @@ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) { size_t tsize; - if (resctrl_arch_is_mbm_total_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) { tsize = sizeof(*hw_dom->arch_mbm_total); hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); if (!hw_dom->arch_mbm_total) return -ENOMEM; } - if (resctrl_arch_is_mbm_local_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) { tsize = sizeof(*hw_dom->arch_mbm_local); hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); if (!hw_dom->arch_mbm_local) { diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index c261558276cd..61d38517e2bf 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -207,11 +207,11 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - if (resctrl_arch_is_mbm_total_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) memset(hw_dom->arch_mbm_total, 0, sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) memset(hw_dom->arch_mbm_local, 0, sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); } diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 3c39cfacb251..42b281b3852f 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -473,12 +473,12 @@ ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, rdt_last_cmd_clear(); if (!strcmp(buf, "mbm_local_bytes")) { - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; else ret = -EINVAL; } else if (!strcmp(buf, "mbm_total_bytes")) { - if (resctrl_arch_is_mbm_total_enabled()) 
+ if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; else ret = -EINVAL; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index d5bf3e0334b6..b0b1dcc36795 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -336,7 +336,7 @@ void free_rmid(u32 closid, u32 rmid) entry = __rmid_entry(idx); - if (resctrl_arch_is_llc_occupancy_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) add_rmid_to_limbo(entry); else list_add_tail(&entry->list, &rmid_free_lru); @@ -635,10 +635,10 @@ static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, * This is protected from concurrent reads from user as both * the user and overflow handler hold the global mutex. */ - if (resctrl_arch_is_mbm_total_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); } @@ -877,6 +877,12 @@ void resctrl_enable_mon_event(enum resctrl_event_id eventid) mon_event_all[eventid].enabled = true; } +bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) +{ + return eventid >= QOS_FIRST_EVENT && eventid < QOS_NUM_EVENTS && + mon_event_all[eventid].enabled; +} + /** * resctrl_mon_resource_init() - Initialise global monitoring structures. * @@ -912,9 +918,9 @@ int resctrl_mon_resource_init(void) RFTYPE_MON_INFO | RFTYPE_RES_CACHE); } - if (resctrl_arch_is_mbm_local_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; - else if (resctrl_arch_is_mbm_total_enabled()) + else if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; return 0; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index ab943a5907c5..2ca8e66c0d20 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -123,8 +123,8 @@ void rdt_staged_configs_clear(void) static bool resctrl_is_mbm_enabled(void) { - return (resctrl_arch_is_mbm_total_enabled() || - resctrl_arch_is_mbm_local_enabled()); + return (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID) || + resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)); } static bool resctrl_is_mbm_event(int e) @@ -196,7 +196,7 @@ static int closid_alloc(void) lockdep_assert_held(&rdtgroup_mutex); if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && - resctrl_arch_is_llc_occupancy_enabled()) { + resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { cleanest_closid = resctrl_find_cleanest_closid(); if (cleanest_closid < 0) return cleanest_closid; @@ -4048,7 +4048,7 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d if (resctrl_is_mbm_enabled()) cancel_delayed_work(&d->mbm_over); - if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && has_busy_rmid(d)) { /* * When a package is going down, forcefully * decrement rmid->ebusy. 
There is no way to know @@ -4084,12 +4084,12 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain u32 idx_limit = resctrl_arch_system_num_rmid_idx(); size_t tsize; - if (resctrl_arch_is_llc_occupancy_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } - if (resctrl_arch_is_mbm_total_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) { tsize = sizeof(*d->mbm_total); d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_total) { @@ -4097,7 +4097,7 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain return -ENOMEM; } } - if (resctrl_arch_is_mbm_local_enabled()) { + if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) { tsize = sizeof(*d->mbm_local); d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); if (!d->mbm_local) { @@ -4142,7 +4142,7 @@ int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) RESCTRL_PICK_ANY_CPU); } - if (resctrl_arch_is_llc_occupancy_enabled()) + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); /* @@ -4217,7 +4217,7 @@ void resctrl_offline_cpu(unsigned int cpu) cancel_delayed_work(&d->mbm_over); mbm_setup_overflow_handler(d, 0, cpu); } - if (resctrl_arch_is_llc_occupancy_enabled() && + if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID) && cpu == d->cqm_work_cpu && has_busy_rmid(d)) { cancel_delayed_work(&d->cqm_limbo); cqm_setup_limbo_handler(d, 0, cpu); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 2944042bd84c..40aba6b5d4f0 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -372,6 +372,8 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid); void resctrl_enable_mon_event(enum resctrl_event_id eventid); +bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); + bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); /** -- cgit v1.2.3 From 7788255aba6545a27b8d143c5256536f8dfb2c0a Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Tue, 9 Sep 2025 10:00:06 +0000 Subject: KVM: Implement barriers before accessing kvm->buses[] on SRCU read paths This ensures that, if a VCPU has "observed" that an IO registration has occurred, the instruction currently being trapped or emulated will also observe the IO registration. At the same time, enforce that kvm_get_bus() is used only on the update side, ensuring that a long-term reference cannot be obtained by an SRCU reader. Signed-off-by: Keir Fraser Signed-off-by: Marc Zyngier --- arch/x86/kvm/vmx/vmx.c | 7 +++++++ include/linux/kvm_host.h | 10 +++++++--- virt/kvm/kvm_main.c | 32 ++++++++++++++++++++++++++------ 3 files changed, 40 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aa157fe5b7b3..0bdf9405969a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5785,6 +5785,13 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; + /* + * Ensure that any updates to kvm->buses[] observed by the + * previous instruction (emulated or otherwise) are also + * visible to the instruction KVM is about to emulate. 
+ */ + smp_rmb(); + if (!kvm_emulate_instruction(vcpu, 0)) return 0; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 15656b7fba6c..e7d6111cf254 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -966,11 +966,15 @@ static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm) return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET); } +/* + * Get a bus reference under the update-side lock. No long-term SRCU reader + * references are permitted, to avoid stale reads vs concurrent IO + * registrations. + */ static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) { - return srcu_dereference_check(kvm->buses[idx], &kvm->srcu, - lockdep_is_held(&kvm->slots_lock) || - !refcount_read(&kvm->users_count)); + return rcu_dereference_protected(kvm->buses[idx], + lockdep_is_held(&kvm->slots_lock)); } static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6c07dd423458..870ad8ea93a7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1103,6 +1103,14 @@ void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm) { } +/* Called only on cleanup and destruction paths when there are no users. */ +static inline struct kvm_io_bus *kvm_get_bus_for_destruction(struct kvm *kvm, + enum kvm_bus idx) +{ + return rcu_dereference_protected(kvm->buses[idx], + !refcount_read(&kvm->users_count)); +} + static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) { struct kvm *kvm = kvm_arch_alloc_vm(); @@ -1228,7 +1236,7 @@ out_err_no_disable: out_err_no_arch_destroy_vm: WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); for (i = 0; i < KVM_NR_BUSES; i++) - kfree(kvm_get_bus(kvm, i)); + kfree(kvm_get_bus_for_destruction(kvm, i)); kvm_free_irq_routing(kvm); out_err_no_irq_routing: cleanup_srcu_struct(&kvm->irq_srcu); @@ -1276,7 +1284,7 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { - struct kvm_io_bus *bus = kvm_get_bus(kvm, i); + struct kvm_io_bus *bus = kvm_get_bus_for_destruction(kvm, i); if (bus) kvm_io_bus_destroy(bus); @@ -5843,6 +5851,18 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, return -EOPNOTSUPP; } +static struct kvm_io_bus *kvm_get_bus_srcu(struct kvm *kvm, enum kvm_bus idx) +{ + /* + * Ensure that any updates to kvm_buses[] observed by the previous vCPU + * machine instruction are also visible to the vCPU machine instruction + * that triggered this call. 
+ */ + smp_mb__after_srcu_read_lock(); + + return srcu_dereference(kvm->buses[idx], &kvm->srcu); +} + int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { @@ -5855,7 +5875,7 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, .len = len, }; - bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx); if (!bus) return -ENOMEM; r = __kvm_io_bus_write(vcpu, bus, &range, val); @@ -5874,7 +5894,7 @@ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, .len = len, }; - bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx); if (!bus) return -ENOMEM; @@ -5924,7 +5944,7 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, .len = len, }; - bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + bus = kvm_get_bus_srcu(vcpu->kvm, bus_idx); if (!bus) return -ENOMEM; r = __kvm_io_bus_read(vcpu, bus, &range, val); @@ -6033,7 +6053,7 @@ struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, srcu_idx = srcu_read_lock(&kvm->srcu); - bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + bus = kvm_get_bus_srcu(kvm, bus_idx); if (!bus) goto out_unlock; -- cgit v1.2.3 From 7d9a0273c45962e9a6bc06f3b87eef7c431c1853 Mon Sep 17 00:00:00 2001 From: Keir Fraser Date: Tue, 9 Sep 2025 10:00:07 +0000 Subject: KVM: Avoid synchronize_srcu() in kvm_io_bus_register_dev() Device MMIO registration may happen quite frequently during VM boot, and the SRCU synchronization each time has a measurable effect on VM startup time. In our experiments it can account for around 25% of a VM's startup time. Replace the synchronization with a deferred free of the old kvm_io_bus structure. 
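A minimal sketch of the pattern adopted below, with locking, error paths and the real KVM context elided (the helper names here are illustrative): the updater publishes the new bus with rcu_assign_pointer(), hands the old one to call_srcu() instead of blocking in synchronize_srcu(), and srcu_barrier() flushes any callbacks still in flight before the srcu_struct is cleaned up.

/* Sketch only: publish-then-defer-free with SRCU. */
#include <linux/slab.h>
#include <linux/srcu.h>

struct io_bus {
	int dev_count;
	struct rcu_head rcu;	/* allows the free to be deferred */
};

static void free_old_bus(struct rcu_head *rcu)
{
	/* Runs only after all SRCU readers of the old pointer are done. */
	kfree(container_of(rcu, struct io_bus, rcu));
}

static void publish_bus(struct srcu_struct *srcu, struct io_bus __rcu **slot,
			struct io_bus *new_bus, struct io_bus *old_bus)
{
	rcu_assign_pointer(*slot, new_bus);
	/* Non-blocking replacement for synchronize_srcu() + kfree(). */
	call_srcu(srcu, &old_bus->rcu, free_old_bus);
}

static void teardown(struct srcu_struct *srcu)
{
	/* Wait for outstanding call_srcu() callbacks before cleanup. */
	srcu_barrier(srcu);
	cleanup_srcu_struct(srcu);
}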
Tested-by: Li RongQing Signed-off-by: Keir Fraser Signed-off-by: Marc Zyngier --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e7d6111cf254..103be35caf0d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -206,6 +206,7 @@ struct kvm_io_range { struct kvm_io_bus { int dev_count; int ioeventfd_count; + struct rcu_head rcu; struct kvm_io_range range[]; }; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 870ad8ea93a7..bcef324ccbf2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1320,6 +1320,7 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_free_memslots(kvm, &kvm->__memslots[i][1]); } cleanup_srcu_struct(&kvm->irq_srcu); + srcu_barrier(&kvm->srcu); cleanup_srcu_struct(&kvm->srcu); #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES xa_destroy(&kvm->mem_attr_array); @@ -5952,6 +5953,13 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, } EXPORT_SYMBOL_GPL(kvm_io_bus_read); +static void __free_bus(struct rcu_head *rcu) +{ + struct kvm_io_bus *bus = container_of(rcu, struct kvm_io_bus, rcu); + + kfree(bus); +} + int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev) { @@ -5990,8 +5998,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, memcpy(new_bus->range + i + 1, bus->range + i, (bus->dev_count - i) * sizeof(struct kvm_io_range)); rcu_assign_pointer(kvm->buses[bus_idx], new_bus); - synchronize_srcu_expedited(&kvm->srcu); - kfree(bus); + call_srcu(&kvm->srcu, &bus->rcu, __free_bus); return 0; } -- cgit v1.2.3 From 83b039877310ae1eb614eef17b780df1e10d9fb5 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 5 Sep 2025 16:34:03 -0500 Subject: x86,fs/resctrl: Prepare for more monitor events There's a rule in computer programming that objects appear zero, once, or many times. So code accordingly. There are two MBM events and resctrl is coded with a lot of if (local) do one thing if (total) do a different thing Change the rdt_mon_domain and rdt_hw_mon_domain structures to hold arrays of pointers to per-event data instead of explicit fields for total and local bandwidth. Simplify by coding for many events using loops over those that are enabled. Move resctrl_is_mbm_event() to <linux/resctrl.h> so it can be used more widely. Also provide a for_each_mbm_event_id() helper macro. Clean up variable names in functions touched to consistently use "eventid" for those with type enum resctrl_event_id.
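The shape this conversion moves toward can be sketched as follows; this is illustrative only, built on the MBM_STATE_IDX() and for_each_mbm_event_id() helpers the patch adds, and the wrapper name is hypothetical:

/* Sketch: one loop over enabled MBM events replaces per-event if-chains. */
static void mbm_update_all_events(struct rdt_resource *r,
				  struct rdt_mon_domain *d,
				  u32 closid, u32 rmid)
{
	enum resctrl_event_id eventid;

	for_each_mbm_event_id(eventid) {
		if (!resctrl_is_mon_event_enabled(eventid))
			continue;
		/* Per-event state sits in d->mbm_states[MBM_STATE_IDX(eventid)]. */
		mbm_update_one_event(r, d, closid, rmid, eventid);
	}
}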
Signed-off-by: Tony Luck Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- arch/x86/kernel/cpu/resctrl/core.c | 40 +++++++++++++++------------ arch/x86/kernel/cpu/resctrl/internal.h | 8 +++--- arch/x86/kernel/cpu/resctrl/monitor.c | 36 ++++++++++++------------ fs/resctrl/monitor.c | 13 ++++----- fs/resctrl/rdtgroup.c | 50 +++++++++++++++++----------------- include/linux/resctrl.h | 23 +++++++++++++--- include/linux/resctrl_types.h | 3 ++ 7 files changed, 96 insertions(+), 77 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 5d14f9a14eda..fbf019c1ff11 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -365,8 +365,10 @@ static void ctrl_domain_free(struct rdt_hw_ctrl_domain *hw_dom) static void mon_domain_free(struct rdt_hw_mon_domain *hw_dom) { - kfree(hw_dom->arch_mbm_total); - kfree(hw_dom->arch_mbm_local); + int idx; + + for_each_mbm_idx(idx) + kfree(hw_dom->arch_mbm_states[idx]); kfree(hw_dom); } @@ -400,25 +402,27 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_ctrl_domain * */ static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_mon_domain *hw_dom) { - size_t tsize; - - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) { - tsize = sizeof(*hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_total) - return -ENOMEM; - } - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) { - tsize = sizeof(*hw_dom->arch_mbm_local); - hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL); - if (!hw_dom->arch_mbm_local) { - kfree(hw_dom->arch_mbm_total); - hw_dom->arch_mbm_total = NULL; - return -ENOMEM; - } + size_t tsize = sizeof(*hw_dom->arch_mbm_states[0]); + enum resctrl_event_id eventid; + int idx; + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + hw_dom->arch_mbm_states[idx] = kcalloc(num_rmid, tsize, GFP_KERNEL); + if (!hw_dom->arch_mbm_states[idx]) + goto cleanup; } return 0; +cleanup: + for_each_mbm_idx(idx) { + kfree(hw_dom->arch_mbm_states[idx]); + hw_dom->arch_mbm_states[idx] = NULL; + } + + return -ENOMEM; } static int get_domain_id_from_scope(int cpu, enum resctrl_scope scope) diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 5e3c41b36437..58dca892a5df 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -54,15 +54,15 @@ struct rdt_hw_ctrl_domain { * struct rdt_hw_mon_domain - Arch private attributes of a set of CPUs that share * a resource for a monitor function * @d_resctrl: Properties exposed to the resctrl file system - * @arch_mbm_total: arch private state for MBM total bandwidth - * @arch_mbm_local: arch private state for MBM local bandwidth + * @arch_mbm_states: Per-event pointer to the MBM event's saved state. + * An MBM event's state is an array of struct arch_mbm_state + * indexed by RMID on x86. * * Members of this structure are accessed via helpers that provide abstraction. 
*/ struct rdt_hw_mon_domain { struct rdt_mon_domain d_resctrl; - struct arch_mbm_state *arch_mbm_total; - struct arch_mbm_state *arch_mbm_local; + struct arch_mbm_state *arch_mbm_states[QOS_NUM_L3_MBM_EVENTS]; }; static inline struct rdt_hw_ctrl_domain *resctrl_to_arch_ctrl_dom(struct rdt_ctrl_domain *r) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 07f8ab097cbe..f01db2034d08 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -161,18 +161,14 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do u32 rmid, enum resctrl_event_id eventid) { - switch (eventid) { - case QOS_L3_OCCUP_EVENT_ID: - return NULL; - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &hw_dom->arch_mbm_total[rmid]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &hw_dom->arch_mbm_local[rmid]; - default: - /* Never expect to get here */ - WARN_ON_ONCE(1); + struct arch_mbm_state *state; + + if (!resctrl_is_mbm_event(eventid)) return NULL; - } + + state = hw_dom->arch_mbm_states[MBM_STATE_IDX(eventid)]; + + return state ? &state[rmid] : NULL; } void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, @@ -201,14 +197,16 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain *d) { struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); - - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) - memset(hw_dom->arch_mbm_total, 0, - sizeof(*hw_dom->arch_mbm_total) * r->num_rmid); - - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) - memset(hw_dom->arch_mbm_local, 0, - sizeof(*hw_dom->arch_mbm_local) * r->num_rmid); + enum resctrl_event_id eventid; + int idx; + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + memset(hw_dom->arch_mbm_states[idx], 0, + sizeof(*hw_dom->arch_mbm_states[0]) * r->num_rmid); + } } static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index b0b1dcc36795..e0dfa5fb969e 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -346,15 +346,14 @@ static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, u32 rmid, enum resctrl_event_id evtid) { u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + struct mbm_state *state; - switch (evtid) { - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[idx]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[idx]; - default: + if (!resctrl_is_mbm_event(evtid)) return NULL; - } + + state = d->mbm_states[MBM_STATE_IDX(evtid)]; + + return state ? &state[idx] : NULL; } static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 2ca8e66c0d20..a6047e9345cd 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -127,12 +127,6 @@ static bool resctrl_is_mbm_enabled(void) resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)); } -static bool resctrl_is_mbm_event(int e) -{ - return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && - e <= QOS_L3_MBM_LOCAL_EVENT_ID); -} - /* * Trivial allocator for CLOSIDs. Use BITMAP APIs to manipulate a bitmap * of free CLOSIDs. 
@@ -4020,9 +4014,13 @@ static void rdtgroup_setup_default(void) static void domain_destroy_mon_state(struct rdt_mon_domain *d) { + int idx; + bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); + for_each_mbm_idx(idx) { + kfree(d->mbm_states[idx]); + d->mbm_states[idx] = NULL; + } } void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) @@ -4082,32 +4080,34 @@ void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) { u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - size_t tsize; + size_t tsize = sizeof(*d->mbm_states[0]); + enum resctrl_event_id eventid; + int idx; if (resctrl_is_mon_event_enabled(QOS_L3_OCCUP_EVENT_ID)) { d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); if (!d->rmid_busy_llc) return -ENOMEM; } - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_total) { - bitmap_free(d->rmid_busy_llc); - return -ENOMEM; - } - } - if (resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - return -ENOMEM; - } + + for_each_mbm_event_id(eventid) { + if (!resctrl_is_mon_event_enabled(eventid)) + continue; + idx = MBM_STATE_IDX(eventid); + d->mbm_states[idx] = kcalloc(idx_limit, tsize, GFP_KERNEL); + if (!d->mbm_states[idx]) + goto cleanup; } return 0; +cleanup: + bitmap_free(d->rmid_busy_llc); + for_each_mbm_idx(idx) { + kfree(d->mbm_states[idx]); + d->mbm_states[idx] = NULL; + } + + return -ENOMEM; } int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 40aba6b5d4f0..478d7a935ca3 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -161,8 +161,9 @@ struct rdt_ctrl_domain { * @hdr: common header for different domain types * @ci_id: cache info id for this domain * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold - * @mbm_total: saved state for MBM total bandwidth - * @mbm_local: saved state for MBM local bandwidth + * @mbm_states: Per-event pointer to the MBM event's saved state. + * An MBM event's state is an array of struct mbm_state + * indexed by RMID on x86 or combined CLOSID, RMID on Arm. 
* @mbm_over: worker to periodically read MBM h/w counters * @cqm_limbo: worker to periodically read CQM h/w counters * @mbm_work_cpu: worker CPU for MBM h/w counters @@ -172,8 +173,7 @@ struct rdt_mon_domain { struct rdt_domain_hdr hdr; unsigned int ci_id; unsigned long *rmid_busy_llc; - struct mbm_state *mbm_total; - struct mbm_state *mbm_local; + struct mbm_state *mbm_states[QOS_NUM_L3_MBM_EVENTS]; struct delayed_work mbm_over; struct delayed_work cqm_limbo; int mbm_work_cpu; @@ -376,6 +376,21 @@ bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid); bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt); +static inline bool resctrl_is_mbm_event(enum resctrl_event_id eventid) +{ + return (eventid >= QOS_L3_MBM_TOTAL_EVENT_ID && + eventid <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +/* Iterate over all memory bandwidth events */ +#define for_each_mbm_event_id(eventid) \ + for (eventid = QOS_L3_MBM_TOTAL_EVENT_ID; \ + eventid <= QOS_L3_MBM_LOCAL_EVENT_ID; eventid++) + +/* Iterate over memory bandwidth arrays in domain structures */ +#define for_each_mbm_idx(idx) \ + for (idx = 0; idx < QOS_NUM_L3_MBM_EVENTS; idx++) + /** * resctrl_arch_mon_event_config_write() - Write the config for an event. * @config_info: struct resctrl_mon_config_info describing the resource, domain diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index 2dadbc54e4b3..d98351663c2c 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -51,4 +51,7 @@ enum resctrl_event_id { QOS_NUM_EVENTS, }; +#define QOS_NUM_L3_MBM_EVENTS (QOS_L3_MBM_LOCAL_EVENT_ID - QOS_L3_MBM_TOTAL_EVENT_ID + 1) +#define MBM_STATE_IDX(evt) ((evt) - QOS_L3_MBM_TOTAL_EVENT_ID) + #endif /* __LINUX_RESCTRL_TYPES_H */ -- cgit v1.2.3 From 5ad68c8f965fed78c61f2ac7aea933f06bb50032 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:06 -0500 Subject: x86,fs/resctrl: Consolidate monitoring related data from rdt_resource The cache allocation and memory bandwidth allocation feature properties are consolidated into struct resctrl_cache and struct resctrl_membw respectively. In preparation for more monitoring properties that would further clutter the existing resource struct, re-organize the monitoring-specific properties to also be in a separate structure. Also convert "bandwidth sources" terminology to "memory transactions" to have consistency within resctrl for related monitoring features. [ bp: Massage commit message. ] Suggested-by: Reinette Chatre Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++-- arch/x86/kernel/cpu/resctrl/monitor.c | 10 +++++----- fs/resctrl/rdtgroup.c | 6 +++--- include/linux/resctrl.h | 18 +++++++++++++----- 4 files changed, 23 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index b07b12a05886..267e9206a999 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -107,7 +107,7 @@ u32 resctrl_arch_system_num_rmid_idx(void) struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl; /* RMID are independent numbers for x86.
num_rmid_idx == num_rmid */ - return r->num_rmid; + return r->mon.num_rmid; } struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l) @@ -541,7 +541,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) arch_mon_domain_online(r, d); - if (arch_domain_mbm_alloc(r->num_rmid, hw_dom)) { + if (arch_domain_mbm_alloc(r->mon.num_rmid, hw_dom)) { mon_domain_free(hw_dom); return; } diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index f01db2034d08..2558b1bdef8b 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -130,7 +130,7 @@ static int logical_rmid_to_physical_rmid(int cpu, int lrmid) if (snc_nodes_per_l3_cache == 1) return lrmid; - return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->num_rmid; + return lrmid + (cpu_to_node(cpu) % snc_nodes_per_l3_cache) * r->mon.num_rmid; } static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) @@ -205,7 +205,7 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * continue; idx = MBM_STATE_IDX(eventid); memset(hw_dom->arch_mbm_states[idx], 0, - sizeof(*hw_dom->arch_mbm_states[0]) * r->num_rmid); + sizeof(*hw_dom->arch_mbm_states[0]) * r->mon.num_rmid); } } @@ -344,7 +344,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024; hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale / snc_nodes_per_l3_cache; - r->num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; + r->mon.num_rmid = (boot_cpu_data.x86_cache_max_rmid + 1) / snc_nodes_per_l3_cache; hw_res->mbm_width = MBM_CNTR_WIDTH_BASE; if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) @@ -359,7 +359,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) * * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. */ - threshold = resctrl_rmid_realloc_limit / r->num_rmid; + threshold = resctrl_rmid_realloc_limit / r->mon.num_rmid; /* * Because num_rmid may not be a power of two, round the value @@ -373,7 +373,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) /* Detect list of bandwidth sources that can be tracked */ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); - r->mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; + r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; } r->mon_capable = true; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index a6047e9345cd..b6ab10704993 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1135,7 +1135,7 @@ static int rdt_num_rmids_show(struct kernfs_open_file *of, { struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - seq_printf(seq, "%d\n", r->num_rmid); + seq_printf(seq, "%d\n", r->mon.num_rmid); return 0; } @@ -1731,9 +1731,9 @@ next: } /* Value from user cannot be more than the supported set of events */ - if ((val & r->mbm_cfg_mask) != val) { + if ((val & r->mon.mbm_cfg_mask) != val) { rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", - r->mbm_cfg_mask); + r->mon.mbm_cfg_mask); return -EINVAL; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 478d7a935ca3..fe2af6cb96d4 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -255,38 +255,46 @@ enum resctrl_schema_fmt { RESCTRL_SCHEMA_RANGE, }; +/** + * struct resctrl_mon - Monitoring related data of a resctrl resource. + * @num_rmid: Number of RMIDs available. 
+ * @mbm_cfg_mask: Memory transactions that can be tracked when bandwidth + * monitoring events can be configured. + */ +struct resctrl_mon { + int num_rmid; + unsigned int mbm_cfg_mask; +}; + /** * struct rdt_resource - attributes of a resctrl resource * @rid: The index of the resource * @alloc_capable: Is allocation available on this machine * @mon_capable: Is monitor feature available on this machine - * @num_rmid: Number of RMIDs available * @ctrl_scope: Scope of this resource for control functions * @mon_scope: Scope of this resource for monitor functions * @cache: Cache allocation related data * @membw: If the component has bandwidth controls, their properties. + * @mon: Monitoring related data. * @ctrl_domains: RCU list of all control domains for this resource * @mon_domains: RCU list of all monitor domains for this resource * @name: Name to use in "schemata" file. * @schema_fmt: Which format string and parser is used for this schema. - * @mbm_cfg_mask: Bandwidth sources that can be tracked when bandwidth - * monitoring events can be configured. * @cdp_capable: Is the CDP feature available on this resource */ struct rdt_resource { int rid; bool alloc_capable; bool mon_capable; - int num_rmid; enum resctrl_scope ctrl_scope; enum resctrl_scope mon_scope; struct resctrl_cache cache; struct resctrl_membw membw; + struct resctrl_mon mon; struct list_head ctrl_domains; struct list_head mon_domains; char *name; enum resctrl_schema_fmt schema_fmt; - unsigned int mbm_cfg_mask; bool cdp_capable; }; -- cgit v1.2.3 From 13390861b426e936db20d675804a5b405622bc79 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:07 -0500 Subject: x86,fs/resctrl: Detect Assignable Bandwidth Monitoring feature details ABMC feature details are reported via CPUID Fn8000_0020_EBX_x5. Bits Description 15:0 MAX_ABMC Maximum Supported Assignable Bandwidth Monitoring Counter ID + 1 The ABMC feature details are documented in APM [1] available from [2]. [1] AMD64 Architecture Programmer's Manual Volume 2: System Programming Publication # 24593 Revision 3.41 section 19.3.3.3 Assignable Bandwidth Monitoring (ABMC). Detect the feature and number of assignable counters supported. For backward compatibility, upon detecting the assignable counter feature, enable the mbm_total_bytes and mbm_local_bytes events that users are familiar with as part of original L3 MBM support. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [2] --- arch/x86/kernel/cpu/resctrl/core.c | 7 +++++-- arch/x86/kernel/cpu/resctrl/monitor.c | 11 ++++++++--- fs/resctrl/monitor.c | 7 +++++++ include/linux/resctrl.h | 4 ++++ 4 files changed, 24 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 267e9206a999..2e68aa02ad3f 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -883,6 +883,8 @@ static __init bool get_rdt_mon_resources(void) resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); ret = true; } + if (rdt_cpu_has(X86_FEATURE_ABMC)) + ret = true; if (!ret) return false; @@ -978,7 +980,7 @@ static enum cpuhp_state rdt_online; /* Runs once on the BSP during boot. 
*/ void resctrl_cpu_detect(struct cpuinfo_x86 *c) { - if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + if (!cpu_has(c, X86_FEATURE_CQM_LLC) && !cpu_has(c, X86_FEATURE_ABMC)) { c->x86_cache_max_rmid = -1; c->x86_cache_occ_scale = -1; c->x86_cache_mbm_width_offset = -1; @@ -990,7 +992,8 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || - cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL) || + cpu_has(c, X86_FEATURE_ABMC)) { u32 eax, ebx, ecx, edx; /* QoS sub-leaf, EAX=0Fh, ECX=1 */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 2558b1bdef8b..0a695ce68f46 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -339,6 +339,7 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); unsigned int threshold; + u32 eax, ebx, ecx, edx; snc_nodes_per_l3_cache = snc_get_config(); @@ -368,14 +369,18 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) */ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold); - if (rdt_cpu_has(X86_FEATURE_BMEC)) { - u32 eax, ebx, ecx, edx; - + if (rdt_cpu_has(X86_FEATURE_BMEC) || rdt_cpu_has(X86_FEATURE_ABMC)) { /* Detect list of bandwidth sources that can be tracked */ cpuid_count(0x80000020, 3, &eax, &ebx, &ecx, &edx); r->mon.mbm_cfg_mask = ecx & MAX_EVT_CONFIG_BITS; } + if (rdt_cpu_has(X86_FEATURE_ABMC)) { + r->mon.mbm_cntr_assignable = true; + cpuid_count(0x80000020, 5, &eax, &ebx, &ecx, &edx); + r->mon.num_mbm_cntrs = (ebx & GENMASK(15, 0)) + 1; + } + r->mon_capable = true; return 0; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index e0dfa5fb969e..b578451de2b5 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -922,6 +922,13 @@ int resctrl_mon_resource_init(void) else if (resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; + if (r->mon.mbm_cntr_assignable) { + if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_TOTAL_EVENT_ID)) + resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); + if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) + resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + } + return 0; } diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index fe2af6cb96d4..eb80cc233be4 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -260,10 +260,14 @@ enum resctrl_schema_fmt { * @num_rmid: Number of RMIDs available. * @mbm_cfg_mask: Memory transactions that can be tracked when bandwidth * monitoring events can be configured. + * @num_mbm_cntrs: Number of assignable counters. + * @mbm_cntr_assignable:Is system capable of supporting counter assignment? */ struct resctrl_mon { int num_rmid; unsigned int mbm_cfg_mask; + int num_mbm_cntrs; + bool mbm_cntr_assignable; }; /** -- cgit v1.2.3 From faebbc58cde9d8f6050ac152c34c88195ed4abaa Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:08 -0500 Subject: x86/resctrl: Add support to enable/disable AMD ABMC feature Add the functionality to enable/disable the AMD ABMC feature. The AMD ABMC feature is enabled by setting bit 0 (the enable bit) in the L3_QOS_EXT_CFG MSR. When the state of ABMC is changed, the MSR needs to be updated on all the logical processors in the QOS domain. Hardware counters will reset when the ABMC state is changed.
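For orientation, a hypothetical helper, not part of the patch (which only writes the bit via msr_set_bit()/msr_clear_bit(), as the diff below shows), that would read the current ABMC state back on the local CPU:

/* Hypothetical, for illustration only: probe ABMC state on this CPU. */
static bool abmc_enabled_on_this_cpu(void)
{
	u64 cfg;

	rdmsrl(MSR_IA32_L3_QOS_EXT_CFG, cfg);

	return cfg & BIT_ULL(ABMC_ENABLE_BIT);
}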
[ bp: Massage commit message. ] Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [2] --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/kernel/cpu/resctrl/internal.h | 5 ++++ arch/x86/kernel/cpu/resctrl/monitor.c | 45 ++++++++++++++++++++++++++++++++++ include/linux/resctrl.h | 20 +++++++++++++++ 4 files changed, 71 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b65c3ba5fa14..e4945e5c92ef 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1223,6 +1223,7 @@ /* - AMD: */ #define MSR_IA32_MBA_BW_BASE 0xc0000200 #define MSR_IA32_SMBA_BW_BASE 0xc0000280 +#define MSR_IA32_L3_QOS_EXT_CFG 0xc00003ff #define MSR_IA32_EVT_CFG_BASE 0xc0000400 /* AMD-V MSRs */ diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 58dca892a5df..a79a487e639c 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -37,6 +37,9 @@ struct arch_mbm_state { u64 prev_msr; }; +/* Setting bit 0 in L3_QOS_EXT_CFG enables the ABMC feature. */ +#define ABMC_ENABLE_BIT 0 + /** * struct rdt_hw_ctrl_domain - Arch private attributes of a set of CPUs that share * a resource for a control function @@ -102,6 +105,7 @@ struct msr_param { * @mon_scale: cqm counter * mon_scale = occupancy in bytes * @mbm_width: Monitor width, to detect and correct for overflow. * @cdp_enabled: CDP state of this resource + * @mbm_cntr_assign_enabled: ABMC feature is enabled * * Members of this structure are either private to the architecture * e.g. mbm_width, or accessed via helpers that provide abstraction. e.g. @@ -115,6 +119,7 @@ struct rdt_hw_resource { unsigned int mon_scale; unsigned int mbm_width; bool cdp_enabled; + bool mbm_cntr_assign_enabled; }; static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 0a695ce68f46..cce35a0ad455 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -399,3 +399,48 @@ void __init intel_rdt_mbm_apply_quirk(void) mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold; mbm_cf = mbm_cf_table[cf_index].cf; } + +static void resctrl_abmc_set_one_amd(void *arg) +{ + bool *enable = arg; + + if (*enable) + msr_set_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); + else + msr_clear_bit(MSR_IA32_L3_QOS_EXT_CFG, ABMC_ENABLE_BIT); +} + +/* + * ABMC enable/disable requires update of L3_QOS_EXT_CFG MSR on all the CPUs + * associated with all monitor domains. 
+ */ +static void _resctrl_abmc_enable(struct rdt_resource *r, bool enable) +{ + struct rdt_mon_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + on_each_cpu_mask(&d->hdr.cpu_mask, resctrl_abmc_set_one_amd, + &enable, 1); + resctrl_arch_reset_rmid_all(r, d); + } +} + +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable) +{ + struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r); + + if (r->mon.mbm_cntr_assignable && + hw_res->mbm_cntr_assign_enabled != enable) { + _resctrl_abmc_enable(r, enable); + hw_res->mbm_cntr_assign_enabled = enable; + } + + return 0; +} + +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) +{ + return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled; +} diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index eb80cc233be4..919806122c50 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -445,6 +445,26 @@ static inline u32 resctrl_get_config_index(u32 closid, bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l); int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); +/** + * resctrl_arch_mbm_cntr_assign_enabled() - Check if MBM counter assignment + * mode is enabled. + * @r: Pointer to the resource structure. + * + * Return: + * true if the assignment mode is enabled, false otherwise. + */ +bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r); + +/** + * resctrl_arch_mbm_cntr_assign_set() - Configure the MBM counter assignment mode. + * @r: Pointer to the resource structure. + * @enable: Set to true to enable, false to disable the assignment mode. + * + * Return: + * 0 on success, < 0 on error. + */ +int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable); + /* * Update the ctrl_val and apply this config right now. * Must be called on one of the domain's CPUs. -- cgit v1.2.3 From 4d32c24a74f2c12ff440d381ba01de574f6631ce Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:11 -0500 Subject: fs/resctrl: Introduce mbm_cntr_cfg to track assignable counters per domain The "mbm_event" counter assignment mode allows users to assign a hardware counter to an RMID, event pair and monitor bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. Counters are assigned/unassigned at the monitoring domain level. Manage a monitoring domain's hardware counters using a per-monitoring-domain array of struct mbm_cntr_cfg that is indexed by the hardware counter ID. A hardware counter's configuration contains the MBM event ID and points to the monitoring group that it is assigned to, with a NULL pointer meaning that the hardware counter is available for assignment. There is no direct way to determine which hardware counters are assigned to a particular monitoring group. Check every entry of every hardware counter configuration array in every monitoring domain to query which MBM events of a monitoring group are tracked by hardware. Such queries are acceptable because of the very small number of assignable counters (32 to 64).
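The query described above amounts to a linear scan over the per-domain counter configuration arrays, roughly as in this sketch against the structures the patch adds (the helper name is illustrative):

/* Sketch: find the counter tracking @evtid for @rdtgrp in domain @d. */
static int mbm_cntr_lookup(struct rdt_resource *r, struct rdt_mon_domain *d,
			   struct rdtgroup *rdtgrp, enum resctrl_event_id evtid)
{
	int cntr_id;

	for (cntr_id = 0; cntr_id < r->mon.num_mbm_cntrs; cntr_id++) {
		/* A NULL rdtgrp marks the counter as free. */
		if (d->cntr_cfg[cntr_id].rdtgrp == rdtgrp &&
		    d->cntr_cfg[cntr_id].evtid == evtid)
			return cntr_id;
	}

	return -ENOENT;
}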
Suggested-by: Peter Newman Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- fs/resctrl/rdtgroup.c | 8 ++++++++ include/linux/resctrl.h | 15 +++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 9d95d01da3f9..61f7b68f2273 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -4029,6 +4029,7 @@ static void domain_destroy_mon_state(struct rdt_mon_domain *d) { int idx; + kfree(d->cntr_cfg); bitmap_free(d->rmid_busy_llc); for_each_mbm_idx(idx) { kfree(d->mbm_states[idx]); @@ -4112,6 +4113,13 @@ static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain goto cleanup; } + if (resctrl_is_mbm_enabled() && r->mon.mbm_cntr_assignable) { + tsize = sizeof(*d->cntr_cfg); + d->cntr_cfg = kcalloc(r->mon.num_mbm_cntrs, tsize, GFP_KERNEL); + if (!d->cntr_cfg) + goto cleanup; + } + return 0; cleanup: bitmap_free(d->rmid_busy_llc); diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 919806122c50..e013caba6641 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -156,6 +156,18 @@ struct rdt_ctrl_domain { u32 *mbps_val; }; +/** + * struct mbm_cntr_cfg - Assignable counter configuration. + * @evtid: MBM event to which the counter is assigned. Only valid + * if @rdtgroup is not NULL. + * @rdtgrp: resctrl group assigned to the counter. NULL if the + * counter is free. + */ +struct mbm_cntr_cfg { + enum resctrl_event_id evtid; + struct rdtgroup *rdtgrp; +}; + /** * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource * @hdr: common header for different domain types @@ -168,6 +180,8 @@ struct rdt_ctrl_domain { * @cqm_limbo: worker to periodically read CQM h/w counters * @mbm_work_cpu: worker CPU for MBM h/w counters * @cqm_work_cpu: worker CPU for CQM h/w counters + * @cntr_cfg: array of assignable counters' configuration (indexed + * by counter ID) */ struct rdt_mon_domain { struct rdt_domain_hdr hdr; @@ -178,6 +192,7 @@ struct rdt_mon_domain { struct delayed_work cqm_limbo; int mbm_work_cpu; int cqm_work_cpu; + struct mbm_cntr_cfg *cntr_cfg; }; /** -- cgit v1.2.3 From ebebda853633de389ba2c6737f8ca38405713e90 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:14 -0500 Subject: fs/resctrl: Introduce event configuration field in struct mon_evt When supported, mbm_event counter assignment mode allows the user to configure events to track specific types of memory transactions. Introduce an evt_cfg field in struct mon_evt to define the type of memory transactions tracked by a monitoring event. Also add a helper function to get the evt_cfg value. 
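The intended use can be sketched like this, paraphrasing the initialisation visible in the diff below; the memory transaction flags (READS_TO_LOCAL_MEM and friends) appear elsewhere in this series and the function name is illustrative:

/*
 * Sketch: an MBM event's evt_cfg is a bitmask of memory transaction types,
 * clamped to what the hardware reports as trackable (mbm_cfg_mask).
 */
static void mbm_evt_cfg_init(struct rdt_resource *r)
{
	mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask;
	mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg =
		r->mon.mbm_cfg_mask & (READS_TO_LOCAL_MEM |
				       READS_TO_LOCAL_S_MEM |
				       NON_TEMP_WRITE_TO_LOCAL_MEM);
}

/* Consumers then read the mask back with resctrl_get_mon_evt_cfg(evtid). */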
Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- fs/resctrl/internal.h | 5 +++++ fs/resctrl/monitor.c | 10 ++++++++++ include/linux/resctrl.h | 2 ++ 3 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 4f372e80bf37..1cddfff007a2 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -56,6 +56,10 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) * @evtid: event id * @rid: resource id for this event * @name: name of the event + * @evt_cfg: Event configuration value that represents the + * memory transactions (e.g., READS_TO_LOCAL_MEM, + * READS_TO_REMOTE_MEM) being tracked by @evtid. + * Only valid if @evtid is an MBM event. * @configurable: true if the event is configurable * @enabled: true if the event is enabled */ @@ -63,6 +67,7 @@ struct mon_evt { enum resctrl_event_id evtid; enum resctrl_res_level rid; char *name; + u32 evt_cfg; bool configurable; bool enabled; }; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 1fa82a62b2e5..f714e7baaea6 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -882,6 +882,11 @@ bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid) mon_event_all[eventid].enabled; } +u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id evtid) +{ + return mon_event_all[evtid].evt_cfg; +} + int resctrl_mbm_assign_mode_show(struct kernfs_open_file *of, struct seq_file *s, void *v) { @@ -1023,6 +1028,11 @@ int resctrl_mon_resource_init(void) resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID); if (!resctrl_is_mon_event_enabled(QOS_L3_MBM_LOCAL_EVENT_ID)) resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID); + mon_event_all[QOS_L3_MBM_TOTAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask; + mon_event_all[QOS_L3_MBM_LOCAL_EVENT_ID].evt_cfg = r->mon.mbm_cfg_mask & + (READS_TO_LOCAL_MEM | + READS_TO_LOCAL_S_MEM | + NON_TEMP_WRITE_TO_LOCAL_MEM); resctrl_file_fflags_init("num_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); resctrl_file_fflags_init("available_mbm_cntrs", diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index e013caba6641..87daa4ca312d 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -409,6 +409,8 @@ static inline bool resctrl_is_mbm_event(enum resctrl_event_id eventid) eventid <= QOS_L3_MBM_LOCAL_EVENT_ID); } +u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id eventid); + /* Iterate over all memory bandwidth events */ #define for_each_mbm_event_id(eventid) \ for (eventid = QOS_L3_MBM_TOTAL_EVENT_ID; \ -- cgit v1.2.3 From f7a4fb22312646329ba21bc58958fd83fb9fc15d Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:15 -0500 Subject: x86,fs/resctrl: Implement resctrl_arch_config_cntr() to assign a counter with ABMC The ABMC feature allows users to assign a hardware counter to an RMID, event pair and monitor bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. Implement an x86 architecture-specific handler to configure a counter. This architecture specific handler is called by resctrl fs when a counter is assigned or unassigned as well as when an already assigned counter's configuration should be updated. Configure counters by writing to the L3_QOS_ABMC_CFG MSR, specifying the counter ID, bandwidth source (RMID), and event configuration. 
The ABMC feature details are documented in APM [1] available from [2]. [1] AMD64 Architecture Programmer's Manual Volume 2: System Programming Publication # 24593 Revision 3.41 section 19.3.3.3 Assignable Bandwidth Monitoring (ABMC). Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [2] --- arch/x86/kernel/cpu/resctrl/monitor.c | 36 +++++++++++++++++++++++++++++++++++ include/linux/resctrl.h | 19 ++++++++++++++++++ 2 files changed, 55 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index cce35a0ad455..ed295a6c5e66 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -444,3 +444,39 @@ bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r) { return resctrl_to_arch_res(r)->mbm_cntr_assign_enabled; } + +static void resctrl_abmc_config_one_amd(void *info) +{ + union l3_qos_abmc_cfg *abmc_cfg = info; + + wrmsrl(MSR_IA32_L3_QOS_ABMC_CFG, abmc_cfg->full); +} + +/* + * Send an IPI to the domain to assign the counter to RMID, event pair. + */ +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign) +{ + struct rdt_hw_mon_domain *hw_dom = resctrl_to_arch_mon_dom(d); + union l3_qos_abmc_cfg abmc_cfg = { 0 }; + struct arch_mbm_state *am; + + abmc_cfg.split.cfg_en = 1; + abmc_cfg.split.cntr_en = assign ? 1 : 0; + abmc_cfg.split.cntr_id = cntr_id; + abmc_cfg.split.bw_src = rmid; + if (assign) + abmc_cfg.split.bw_type = resctrl_get_mon_evt_cfg(evtid); + + smp_call_function_any(&d->hdr.cpu_mask, resctrl_abmc_config_one_amd, &abmc_cfg, 1); + + /* + * The hardware counter is reset (because cfg_en == 1) so there is no + * need to record initial non-zero counts. + */ + am = get_arch_mbm_state(hw_dom, rmid, evtid); + if (am) + memset(am, 0, sizeof(*am)); +} diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 87daa4ca312d..50e38445183a 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -594,6 +594,25 @@ void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_mon_domain * */ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r); +/** + * resctrl_arch_config_cntr() - Configure the counter with its new RMID + * and event details. + * @r: Resource structure. + * @d: The domain in which counter with ID @cntr_id should be configured. + * @evtid: Monitoring event type (e.g., QOS_L3_MBM_TOTAL_EVENT_ID + * or QOS_L3_MBM_LOCAL_EVENT_ID). + * @rmid: RMID. + * @closid: CLOSID. + * @cntr_id: Counter ID to configure. + * @assign: True to assign the counter or update an existing assignment, + * false to unassign the counter. + * + * This can be called from any CPU. 
+ */ +void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + enum resctrl_event_id evtid, u32 rmid, u32 closid, + u32 cntr_id, bool assign); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; -- cgit v1.2.3 From 862314fd1f93d96eddb0559a807c66cb1f6ee520 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:19 -0500 Subject: fs/resctrl: Introduce counter ID read, reset calls in mbm_event mode When supported, "mbm_event" counter assignment mode allows users to assign a hardware counter to an RMID, event pair and monitor the bandwidth usage as long as it is assigned. The hardware continues to track the assigned counter until it is explicitly unassigned by the user. Introduce the architecture calls resctrl_arch_cntr_read() and resctrl_arch_reset_cntr() to read and reset event counters when "mbm_event" mode is supported. Function names match existing resctrl_arch_rmid_read() and resctrl_arch_reset_rmid(). Suggested-by: Reinette Chatre Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- include/linux/resctrl.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 50e38445183a..04152654827d 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -613,6 +613,44 @@ void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, enum resctrl_event_id evtid, u32 rmid, u32 closid, u32 cntr_id, bool assign); +/** + * resctrl_arch_cntr_read() - Read the event data corresponding to the counter ID + * assigned to the RMID, event pair for this resource + * and domain. + * @r: Resource that the counter should be read from. + * @d: Domain that the counter should be read from. + * @closid: CLOSID that matches the RMID. + * @rmid: The RMID to which @cntr_id is assigned. + * @cntr_id: The counter to read. + * @eventid: The MBM event to which @cntr_id is assigned. + * @val: Result of the counter read in bytes. + * + * Called on a CPU that belongs to domain @d when "mbm_event" mode is enabled. + * Called from a non-migrateable process context via smp_call_on_cpu() unless all + * CPUs are nohz_full, in which case it is called via IPI (smp_call_function_any()). + * + * Return: + * 0 on success, or -EIO, -EINVAL etc on error. + */ +int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid, u64 *val); + +/** + * resctrl_arch_reset_cntr() - Reset any private state associated with counter ID. + * @r: The domain's resource. + * @d: The counter ID's domain. + * @closid: CLOSID that matches the RMID. + * @rmid: The RMID to which @cntr_id is assigned. + * @cntr_id: The counter to reset. + * @eventid: The MBM event to which @cntr_id is assigned. + * + * This can be called from any CPU. 
+ */ +void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, int cntr_id, + enum resctrl_event_id eventid); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; -- cgit v1.2.3 From ea274cbeaf8f0667267b347e3f84797439cdab4e Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:23 -0500 Subject: fs/resctrl: Add event configuration directory under info/L3_MON/ The "mbm_event" counter assignment mode allows the user to assign a hardware counter to an RMID, event pair and monitor the bandwidth as long as it is assigned. The user can specify the memory transaction(s) for the counter to track. When this mode is supported, the /sys/fs/resctrl/info/L3_MON/event_configs directory contains a sub-directory for each MBM event that can be assigned to a counter. The MBM event sub-directory contains a file named "event_filter" that is used to view and modify which memory transactions the MBM event is configured with. Create /sys/fs/resctrl/info/L3_MON/event_configs directory on resctrl mount and pre-populate it with directories for the two existing MBM events: mbm_total_bytes and mbm_local_bytes. Create the "event_filter" file within each MBM event directory with the needed *show() that displays the memory transactions with which the MBM event is configured. Example: $ mount -t resctrl resctrl /sys/fs/resctrl $ cd /sys/fs/resctrl/ $ cat info/L3_MON/event_configs/mbm_total_bytes/event_filter local_reads,remote_reads,local_non_temporal_writes, remote_non_temporal_writes,local_reads_slow_memory, remote_reads_slow_memory,dirty_victim_writes_all $ cat info/L3_MON/event_configs/mbm_local_bytes/event_filter local_reads,local_non_temporal_writes,local_reads_slow_memory Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- Documentation/filesystems/resctrl.rst | 33 ++++++++++++++++++++ fs/resctrl/internal.h | 4 +++ fs/resctrl/monitor.c | 56 +++++++++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 59 +++++++++++++++++++++++++++++++++-- include/linux/resctrl_types.h | 3 ++ 5 files changed, 153 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index 4c24c5f3f4c1..ddd95f1472e6 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -310,6 +310,39 @@ with the following files: # cat /sys/fs/resctrl/info/L3_MON/available_mbm_cntrs 0=30;1=30 +"event_configs": + Directory that exists when "mbm_event" counter assignment mode is supported. + Contains a sub-directory for each MBM event that can be assigned to a counter. + + Two MBM events are supported by default: mbm_local_bytes and mbm_total_bytes. + Each MBM event's sub-directory contains a file named "event_filter" that is + used to view and modify which memory transactions the MBM event is configured + with. The file is accessible only when "mbm_event" counter assignment mode is + enabled. 
+ + List of memory transaction types supported: + + ========================== ======================================================== + Name Description + ========================== ======================================================== + dirty_victim_writes_all Dirty Victims from the QOS domain to all types of memory + remote_reads_slow_memory Reads to slow memory in the non-local NUMA domain + local_reads_slow_memory Reads to slow memory in the local NUMA domain + remote_non_temporal_writes Non-temporal writes to non-local NUMA domain + local_non_temporal_writes Non-temporal writes to local NUMA domain + remote_reads Reads to memory in the non-local NUMA domain + local_reads Reads to memory in the local NUMA domain + ========================== ======================================================== + + For example:: + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter + local_reads,remote_reads,local_non_temporal_writes,remote_non_temporal_writes, + local_reads_slow_memory,remote_reads_slow_memory,dirty_victim_writes_all + + # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_local_bytes/event_filter + local_reads,local_non_temporal_writes,local_reads_slow_memory + "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 2f1f2efe2f40..9bf2e2fd5c19 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -241,6 +241,8 @@ struct rdtgroup { #define RFTYPE_DEBUG BIT(10) +#define RFTYPE_ASSIGN_CONFIG BIT(11) + #define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) #define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) @@ -403,6 +405,8 @@ void rdtgroup_assign_cntrs(struct rdtgroup *rdtgrp); void rdtgroup_unassign_cntrs(struct rdtgroup *rdtgrp); +int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index 55327056596e..7179f9865a48 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -972,6 +972,61 @@ u32 resctrl_get_mon_evt_cfg(enum resctrl_event_id evtid) return mon_event_all[evtid].evt_cfg; } +/** + * struct mbm_transaction - Memory transaction an MBM event can be configured with. + * @name: Name of memory transaction (read, write ...). + * @val: The bit (eg. READS_TO_LOCAL_MEM or READS_TO_REMOTE_MEM) used to + * represent the memory transaction within an event's configuration. + */ +struct mbm_transaction { + char name[32]; + u32 val; +}; + +/* Decoded values for each type of memory transaction. 
*/ +static struct mbm_transaction mbm_transactions[NUM_MBM_TRANSACTIONS] = { + {"local_reads", READS_TO_LOCAL_MEM}, + {"remote_reads", READS_TO_REMOTE_MEM}, + {"local_non_temporal_writes", NON_TEMP_WRITE_TO_LOCAL_MEM}, + {"remote_non_temporal_writes", NON_TEMP_WRITE_TO_REMOTE_MEM}, + {"local_reads_slow_memory", READS_TO_LOCAL_S_MEM}, + {"remote_reads_slow_memory", READS_TO_REMOTE_S_MEM}, + {"dirty_victim_writes_all", DIRTY_VICTIMS_TO_ALL_MEM}, +}; + +int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) +{ + struct mon_evt *mevt = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r; + bool sep = false; + int ret = 0, i; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + r = resctrl_arch_get_resource(mevt->rid); + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + for (i = 0; i < NUM_MBM_TRANSACTIONS; i++) { + if (mevt->evt_cfg & mbm_transactions[i].val) { + if (sep) + seq_putc(seq, ','); + seq_printf(seq, "%s", mbm_transactions[i].name); + sep = true; + } + } + seq_putc(seq, '\n'); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + /* * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID * pair in the domain. @@ -1287,6 +1342,7 @@ int resctrl_mon_resource_init(void) RFTYPE_MON_INFO | RFTYPE_RES_CACHE); resctrl_file_fflags_init("available_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG); } return 0; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 2e1d0a2703da..8f0c403e3fb5 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1923,6 +1923,12 @@ static struct rftype res_common_files[] = { .seq_show = mbm_local_bytes_config_show, .write = mbm_local_bytes_config_write, }, + { + .name = "event_filter", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = event_filter_show, + }, { .name = "mbm_assign_mode", .mode = 0444, @@ -2183,10 +2189,48 @@ int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, return ret; } +static int resctrl_mkdir_event_configs(struct rdt_resource *r, struct kernfs_node *l3_mon_kn) +{ + struct kernfs_node *kn_subdir, *kn_subdir2; + struct mon_evt *mevt; + int ret; + + kn_subdir = kernfs_create_dir(l3_mon_kn, "event_configs", l3_mon_kn->mode, NULL); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + for_each_mon_event(mevt) { + if (mevt->rid != r->rid || !mevt->enabled || !resctrl_is_mbm_event(mevt->evtid)) + continue; + + kn_subdir2 = kernfs_create_dir(kn_subdir, mevt->name, kn_subdir->mode, mevt); + if (IS_ERR(kn_subdir2)) { + ret = PTR_ERR(kn_subdir2); + goto out; + } + + ret = rdtgroup_kn_set_ugid(kn_subdir2); + if (ret) + goto out; + + ret = rdtgroup_add_files(kn_subdir2, RFTYPE_ASSIGN_CONFIG); + if (ret) + break; + } + +out: + return ret; +} + static int rdtgroup_mkdir_info_resdir(void *priv, char *name, unsigned long fflags) { struct kernfs_node *kn_subdir; + struct rdt_resource *r; int ret; kn_subdir = kernfs_create_dir(kn_info, name, @@ -2199,8 +2243,19 @@ static int rdtgroup_mkdir_info_resdir(void *priv, char *name, return ret; ret = rdtgroup_add_files(kn_subdir, fflags); - if (!ret) - kernfs_activate(kn_subdir); + if (ret) + return ret; + + if ((fflags & RFTYPE_MON_INFO) == RFTYPE_MON_INFO) { + r = priv; + if (r->mon.mbm_cntr_assignable) { + ret = 
resctrl_mkdir_event_configs(r, kn_subdir); + if (ret) + return ret; + } + } + + kernfs_activate(kn_subdir); return ret; } diff --git a/include/linux/resctrl_types.h b/include/linux/resctrl_types.h index d98351663c2c..acfe07860b34 100644 --- a/include/linux/resctrl_types.h +++ b/include/linux/resctrl_types.h @@ -34,6 +34,9 @@ /* Max event bits supported */ #define MAX_EVT_CONFIG_BITS GENMASK(6, 0) +/* Number of memory transactions that an MBM event can be configured with */ +#define NUM_MBM_TRANSACTIONS 7 + /* Event IDs */ enum resctrl_event_id { /* Must match value of first event below */ -- cgit v1.2.3 From ac1df9bb0ba3ae94137fb494cd9efc598f65d826 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Fri, 5 Sep 2025 16:34:25 -0500 Subject: fs/resctrl: Introduce mbm_assign_on_mkdir to enable assignments on mkdir The "mbm_event" counter assignment mode allows users to assign a hardware counter to an RMID, event pair and monitor the bandwidth as long as it is assigned. Introduce a user-configurable option that determines if a counter will automatically be assigned to an RMID, event pair when its associated monitor group is created via mkdir. Accessible when "mbm_event" counter assignment mode is enabled. Suggested-by: Peter Newman Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://lore.kernel.org/cover.1757108044.git.babu.moger@amd.com --- Documentation/filesystems/resctrl.rst | 20 +++++++++++++ fs/resctrl/internal.h | 6 ++++ fs/resctrl/monitor.c | 53 +++++++++++++++++++++++++++++++++++ fs/resctrl/rdtgroup.c | 7 +++++ include/linux/resctrl.h | 3 ++ 5 files changed, 89 insertions(+) (limited to 'include/linux') diff --git a/Documentation/filesystems/resctrl.rst b/Documentation/filesystems/resctrl.rst index 2e840ef26f68..1de815b3a07b 100644 --- a/Documentation/filesystems/resctrl.rst +++ b/Documentation/filesystems/resctrl.rst @@ -355,6 +355,26 @@ with the following files: # cat /sys/fs/resctrl/info/L3_MON/event_configs/mbm_total_bytes/event_filter local_reads,local_non_temporal_writes +"mbm_assign_on_mkdir": + Exists when "mbm_event" counter assignment mode is supported. Accessible + only when "mbm_event" counter assignment mode is enabled. + + Determines if a counter will automatically be assigned to an RMID, MBM event + pair when its associated monitor group is created via mkdir. Enabled by default + on boot, also when switched from "default" mode to "mbm_event" counter assignment + mode. Users can disable this capability by writing to the interface. + + "0": + Auto assignment is disabled. + "1": + Auto assignment is enabled. 
+ + Example:: + + # echo 0 > /sys/fs/resctrl/info/L3_MON/mbm_assign_on_mkdir + # cat /sys/fs/resctrl/info/L3_MON/mbm_assign_on_mkdir + 0 + "max_threshold_occupancy": Read/write file provides the largest value (in bytes) at which a previously used LLC_occupancy diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 90d3e4ab335b..66c677c1b858 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -410,6 +410,12 @@ int event_filter_show(struct kernfs_open_file *of, struct seq_file *seq, void *v ssize_t event_filter_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); +int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off); + #ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index ccb9726bba54..deca9535fbbb 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -1027,6 +1027,57 @@ out_unlock: return ret; } +int resctrl_mbm_assign_on_mkdir_show(struct kernfs_open_file *of, struct seq_file *s, + void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + int ret = 0; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + seq_printf(s, "%u\n", r->mon.mbm_assign_on_mkdir); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + +ssize_t resctrl_mbm_assign_on_mkdir_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + bool value; + int ret; + + ret = kstrtobool(buf, &value); + if (ret) + return ret; + + mutex_lock(&rdtgroup_mutex); + rdt_last_cmd_clear(); + + if (!resctrl_arch_mbm_cntr_assign_enabled(r)) { + rdt_last_cmd_puts("mbm_event counter assignment mode is not enabled\n"); + ret = -EINVAL; + goto out_unlock; + } + + r->mon.mbm_assign_on_mkdir = value; + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret ?: nbytes; +} + /* * rdtgroup_assign_cntr() - Assign/unassign the counter ID for the event, RMID * pair in the domain. @@ -1457,6 +1508,8 @@ int resctrl_mon_resource_init(void) resctrl_file_fflags_init("available_mbm_cntrs", RFTYPE_MON_INFO | RFTYPE_RES_CACHE); resctrl_file_fflags_init("event_filter", RFTYPE_ASSIGN_CONFIG); + resctrl_file_fflags_init("mbm_assign_on_mkdir", RFTYPE_MON_INFO | + RFTYPE_RES_CACHE); } return 0; diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index e90bc808fe53..c7ea42c2a3c2 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -1808,6 +1808,13 @@ static struct rftype res_common_files[] = { .seq_show = rdt_last_cmd_status_show, .fflags = RFTYPE_TOP_INFO, }, + { + .name = "mbm_assign_on_mkdir", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = resctrl_mbm_assign_on_mkdir_show, + .write = resctrl_mbm_assign_on_mkdir_write, + }, { .name = "num_closids", .mode = 0444, diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 04152654827d..a7d92718b653 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -277,12 +277,15 @@ enum resctrl_schema_fmt { * monitoring events can be configured. * @num_mbm_cntrs: Number of assignable counters. * @mbm_cntr_assignable:Is system capable of supporting counter assignment? 
+ * @mbm_assign_on_mkdir:True if counters should automatically be assigned to MBM
+ *			events of monitor groups created via mkdir.
  */
 struct resctrl_mon {
 	int num_rmid;
 	unsigned int mbm_cfg_mask;
 	int num_mbm_cntrs;
 	bool mbm_cntr_assignable;
+	bool mbm_assign_on_mkdir;
 };
 
 /**
-- cgit v1.2.3


From 3c17001b21b9f168c957ced9384abe969019b609 Mon Sep 17 00:00:00 2001
From: Christian Brauner
Date: Fri, 12 Sep 2025 13:52:24 +0200
Subject: pidfs: validate extensible ioctls

Validate extensible ioctls more strictly than we do now.

Reviewed-by: Aleksa Sarai
Reviewed-by: Jan Kara
Signed-off-by: Christian Brauner
---
 fs/pidfs.c         |  2 +-
 include/linux/fs.h | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/pidfs.c b/fs/pidfs.c
index edc35522d75c..0a5083b9cce5 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -440,7 +440,7 @@ static bool pidfs_ioctl_valid(unsigned int cmd)
 	 * erronously mistook the file descriptor for a pidfd.
 	 * This is not perfect but will catch most cases.
 	 */
-	return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO));
+	return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0);
 	}
 
 	return false;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d7ab4f96d705..2f2edc53bf3c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -4023,4 +4023,18 @@ static inline bool vfs_empty_path(int dfd, const char __user *path)
 
 int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter);
 
+static inline bool extensible_ioctl_valid(unsigned int cmd_a,
+					  unsigned int cmd_b, size_t min_size)
+{
+	if (_IOC_DIR(cmd_a) != _IOC_DIR(cmd_b))
+		return false;
+	if (_IOC_TYPE(cmd_a) != _IOC_TYPE(cmd_b))
+		return false;
+	if (_IOC_NR(cmd_a) != _IOC_NR(cmd_b))
+		return false;
+	if (_IOC_SIZE(cmd_a) < min_size)
+		return false;
+	return true;
+}
+
 #endif /* _LINUX_FS_H */
-- cgit v1.2.3


From be975448a45cd024e2b98598eefc0e164ad93f09 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Tue, 22 Jul 2025 14:25:35 -0700
Subject: srcu: Document __srcu_read_{,un}lock_fast() implicit RCU readers

This commit documents the implicit RCU readers that are implied by the
this_cpu_inc() and atomic_long_inc() operations in __srcu_read_lock_fast()
and __srcu_read_unlock_fast(). While in the area, fix the documentation
of the memory pairing of atomic_long_inc() in __srcu_read_lock_fast().

[ paulmck: Apply Joel Fernandes feedback. ]

Signed-off-by: Paul E. McKenney
Cc: Mathieu Desnoyers
Cc: Steven Rostedt
Cc: Sebastian Andrzej Siewior
Cc:
---
 include/linux/srcutree.h | 42 +++++++++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 4d2fee4d3828..42098e0fa0b7 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -232,9 +232,27 @@ static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ss
  * srcu_read_unlock_fast().
  *
  * Note that both this_cpu_inc() and atomic_long_inc() are RCU read-side
- * critical sections either because they disables interrupts, because they
- * are a single instruction, or because they are a read-modify-write atomic
- * operation, depending on the whims of the architecture.
+ * critical sections either because they disable interrupts, because
+ * they are a single instruction, or because they are read-modify-write
+ * atomic operations, depending on the whims of the architecture.
+ * This matters because the SRCU-fast grace-period mechanism uses either + * synchronize_rcu() or synchronize_rcu_expedited(), that is, RCU, + * *not* SRCU, in order to eliminate the need for the read-side smp_mb() + * invocations that are used by srcu_read_lock() and srcu_read_unlock(). + * The __srcu_read_unlock_fast() function also relies on this same RCU + * (again, *not* SRCU) trick to eliminate the need for smp_mb(). + * + * The key point behind this RCU trick is that if any part of a given + * RCU reader precedes the beginning of a given RCU grace period, then + * the entirety of that RCU reader and everything preceding it happens + * before the end of that same RCU grace period. Similarly, if any part + * of a given RCU reader follows the end of a given RCU grace period, + * then the entirety of that RCU reader and everything following it + * happens after the beginning of that same RCU grace period. Therefore, + * the operations labeled Y in __srcu_read_lock_fast() and those labeled Z + * in __srcu_read_unlock_fast() are ordered against the corresponding SRCU + * read-side critical section from the viewpoint of the SRCU grace period. + * This is all the ordering that is required, hence no calls to smp_mb(). * * This means that __srcu_read_lock_fast() is not all that fast * on architectures that support NMIs but do not supply NMI-safe @@ -245,9 +263,9 @@ static inline struct srcu_ctr __percpu notrace *__srcu_read_lock_fast(struct src struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) - this_cpu_inc(scp->srcu_locks.counter); /* Y */ + this_cpu_inc(scp->srcu_locks.counter); // Y, and implicit RCU reader. else - atomic_long_inc(raw_cpu_ptr(&scp->srcu_locks)); /* Z */ + atomic_long_inc(raw_cpu_ptr(&scp->srcu_locks)); // Y, and implicit RCU reader. barrier(); /* Avoid leaking the critical section. */ return scp; } @@ -258,23 +276,17 @@ static inline struct srcu_ctr __percpu notrace *__srcu_read_lock_fast(struct src * different CPU than that which was incremented by the corresponding * srcu_read_lock_fast(), but it must be within the same task. * - * Note that both this_cpu_inc() and atomic_long_inc() are RCU read-side - * critical sections either because they disables interrupts, because they - * are a single instruction, or because they are a read-modify-write atomic - * operation, depending on the whims of the architecture. - * - * This means that __srcu_read_unlock_fast() is not all that fast - * on architectures that support NMIs but do not supply NMI-safe - * implementations of this_cpu_inc(). + * Please see the __srcu_read_lock_fast() function's header comment for + * information on implicit RCU readers and NMI safety. */ static inline void notrace __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) { barrier(); /* Avoid leaking the critical section. */ if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) - this_cpu_inc(scp->srcu_unlocks.counter); /* Z */ + this_cpu_inc(scp->srcu_unlocks.counter); // Z, and implicit RCU reader. else - atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks)); /* Z */ + atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks)); // Z, and implicit RCU reader. } void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); -- cgit v1.2.3 From 5e0ae59159e3a07391a35865bb79ff335473fa79 Mon Sep 17 00:00:00 2001 From: Angela Czubak Date: Mon, 18 Aug 2025 23:08:42 +0000 Subject: HID: add haptics page defines Introduce haptic usages as defined in HID Usage Tables specification. 
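For illustration only (hypothetical driver code, not part of this patch),
a driver can dispatch on the new usages from its input_mapping() callback;
the callback signature is the existing one from struct hid_driver:

    /* Hypothetical sketch: claim haptic usages so that hid-input
     * does not map them as regular input events.
     */
    static int xyz_input_mapping(struct hid_device *hdev,
                                 struct hid_input *hi,
                                 struct hid_field *field,
                                 struct hid_usage *usage,
                                 unsigned long **bit, int *max)
    {
            switch (usage->hid) {
            case HID_HP_MANUALTRIGGER:
            case HID_HP_INTENSITY:
                    return -1;      /* ignore for hid-input purposes */
            default:
                    return 0;       /* keep the default mapping */
            }
    }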
Add HID units for newton and gram.

Signed-off-by: Angela Czubak
Co-developed-by: Jonathan Denose
Signed-off-by: Jonathan Denose
Signed-off-by: Benjamin Tissoires
---
 include/linux/hid.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index 2cc4f1e4ea96..10f113c758fe 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -156,6 +156,7 @@ struct hid_item {
 #define HID_UP_TELEPHONY	0x000b0000
 #define HID_UP_CONSUMER		0x000c0000
 #define HID_UP_DIGITIZER	0x000d0000
+#define HID_UP_HAPTIC		0x000e0000
 #define HID_UP_PID		0x000f0000
 #define HID_UP_BATTERY		0x00850000
 #define HID_UP_CAMERA		0x00900000
@@ -316,6 +317,28 @@ struct hid_item {
 #define HID_DG_TOOLSERIALNUMBER	0x000d005b
 #define HID_DG_LATENCYMODE	0x000d0060
 
+#define HID_HP_SIMPLECONTROLLER	0x000e0001
+#define HID_HP_WAVEFORMLIST	0x000e0010
+#define HID_HP_DURATIONLIST	0x000e0011
+#define HID_HP_AUTOTRIGGER	0x000e0020
+#define HID_HP_MANUALTRIGGER	0x000e0021
+#define HID_HP_AUTOTRIGGERASSOCIATEDCONTROL	0x000e0022
+#define HID_HP_INTENSITY	0x000e0023
+#define HID_HP_REPEATCOUNT	0x000e0024
+#define HID_HP_RETRIGGERPERIOD	0x000e0025
+#define HID_HP_WAVEFORMVENDORPAGE	0x000e0026
+#define HID_HP_WAVEFORMVENDORID	0x000e0027
+#define HID_HP_WAVEFORMCUTOFFTIME	0x000e0028
+#define HID_HP_WAVEFORMNONE	0x000e1001
+#define HID_HP_WAVEFORMSTOP	0x000e1002
+#define HID_HP_WAVEFORMCLICK	0x000e1003
+#define HID_HP_WAVEFORMBUZZCONTINUOUS	0x000e1004
+#define HID_HP_WAVEFORMRUMBLECONTINUOUS	0x000e1005
+#define HID_HP_WAVEFORMPRESS	0x000e1006
+#define HID_HP_WAVEFORMRELEASE	0x000e1007
+#define HID_HP_VENDORWAVEFORMMIN	0x000e2001
+#define HID_HP_VENDORWAVEFORMMAX	0x000e2fff
+
 #define HID_BAT_ABSOLUTESTATEOFCHARGE	0x00850065
 #define HID_BAT_CHARGING	0x00850044
 
@@ -423,6 +446,12 @@ struct hid_item {
 #define HID_REPORT_PROTOCOL	1
 #define HID_BOOT_PROTOCOL	0
 
+/*
+ * HID units
+ */
+#define HID_UNIT_GRAM	0x0101
+#define HID_UNIT_NEWTON	0xe111
+
 /*
  * This is the global environment of the parser. This information is
  * persistent for main-items. The global environment can be saved and
-- cgit v1.2.3


From 4e584ac737884ef0fd80f6836b917972fad86b17 Mon Sep 17 00:00:00 2001
From: Angela Czubak
Date: Mon, 18 Aug 2025 23:08:50 +0000
Subject: Input: MT - add INPUT_MT_TOTAL_FORCE flag

Add a flag to generate ABS_PRESSURE as the sum of ABS_MT_PRESSURE across
all slots. This flag should be set if one knows a device reports true
force and would like to report total force to userspace.
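A hedged usage sketch (hypothetical driver init code, not part of this
patch): a driver whose hardware reports true per-contact force opts in
when creating its slots:

    /* With INPUT_MT_TOTAL_FORCE, pointer emulation reports
     * ABS_PRESSURE as the sum of ABS_MT_PRESSURE over all active
     * slots. The slot count and axis range below are made up.
     */
    input_set_abs_params(dev, ABS_MT_PRESSURE, 0, 255, 0, 0);
    error = input_mt_init_slots(dev, 10,
                                INPUT_MT_POINTER | INPUT_MT_TOTAL_FORCE);
    if (error)
            return error;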
Signed-off-by: Angela Czubak Co-developed-by: Jonathan Denose Signed-off-by: Jonathan Denose Acked-by: Dmitry Torokhov Signed-off-by: Benjamin Tissoires --- drivers/input/input-mt.c | 14 ++++++++++---- include/linux/input/mt.h | 1 + 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/input-mt.c b/drivers/input/input-mt.c index 337006dd9dcf..09f518897d4a 100644 --- a/drivers/input/input-mt.c +++ b/drivers/input/input-mt.c @@ -198,6 +198,7 @@ void input_mt_report_pointer_emulation(struct input_dev *dev, bool use_count) struct input_mt *mt = dev->mt; struct input_mt_slot *oldest; int oldid, count, i; + int p, reported_p = 0; if (!mt) return; @@ -216,6 +217,13 @@ void input_mt_report_pointer_emulation(struct input_dev *dev, bool use_count) oldest = ps; oldid = id; } + if (test_bit(ABS_MT_PRESSURE, dev->absbit)) { + p = input_mt_get_value(ps, ABS_MT_PRESSURE); + if (mt->flags & INPUT_MT_TOTAL_FORCE) + reported_p += p; + else if (oldid == id) + reported_p = p; + } count++; } @@ -245,10 +253,8 @@ void input_mt_report_pointer_emulation(struct input_dev *dev, bool use_count) input_event(dev, EV_ABS, ABS_X, x); input_event(dev, EV_ABS, ABS_Y, y); - if (test_bit(ABS_MT_PRESSURE, dev->absbit)) { - int p = input_mt_get_value(oldest, ABS_MT_PRESSURE); - input_event(dev, EV_ABS, ABS_PRESSURE, p); - } + if (test_bit(ABS_MT_PRESSURE, dev->absbit)) + input_event(dev, EV_ABS, ABS_PRESSURE, reported_p); } else { if (test_bit(ABS_MT_PRESSURE, dev->absbit)) input_event(dev, EV_ABS, ABS_PRESSURE, 0); diff --git a/include/linux/input/mt.h b/include/linux/input/mt.h index 2cf89a538b18..d30286298a00 100644 --- a/include/linux/input/mt.h +++ b/include/linux/input/mt.h @@ -17,6 +17,7 @@ #define INPUT_MT_DROP_UNUSED 0x0004 /* drop contacts not seen in frame */ #define INPUT_MT_TRACK 0x0008 /* use in-kernel tracking */ #define INPUT_MT_SEMI_MT 0x0010 /* semi-mt device, finger count handled manually */ +#define INPUT_MT_TOTAL_FORCE 0x0020 /* calculate total force from slots pressure */ /** * struct input_mt_slot - represents the state of an input MT slot -- cgit v1.2.3 From 8aa1e3a6f0ffbcfdf3bd7d87feb9090f96c54bc4 Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:40 -0700 Subject: firmware: qcom: tzmem: export shm_bridge create/delete Anyone with access to contiguous physical memory should be able to share memory with QTEE using shm_bridge. Tested-by: Neil Armstrong Tested-by: Harshal Dev Reviewed-by: Kuldeep Singh Signed-off-by: Amirreza Zarrabi Link: https://lore.kernel.org/r/20250911-qcom-tee-using-tee-ss-without-mem-obj-v12-1-17f07a942b8d@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/firmware/qcom/qcom_tzmem.c | 63 ++++++++++++++++++++++++++------ include/linux/firmware/qcom/qcom_tzmem.h | 15 ++++++++ 2 files changed, 67 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/qcom/qcom_tzmem.c b/drivers/firmware/qcom/qcom_tzmem.c index ea0a35355657..186511ced924 100644 --- a/drivers/firmware/qcom/qcom_tzmem.c +++ b/drivers/firmware/qcom/qcom_tzmem.c @@ -109,7 +109,19 @@ notsupp: return 0; } -static int qcom_tzmem_init_area(struct qcom_tzmem_area *area) +/** + * qcom_tzmem_shm_bridge_create() - Create a SHM bridge. + * @paddr: Physical address of the memory to share. + * @size: Size of the memory to share. + * @handle: Handle to the SHM bridge. + * + * On platforms that support SHM bridge, this function creates a SHM bridge + * for the given memory region with QTEE. 
The handle returned by this function + * must be passed to qcom_tzmem_shm_bridge_delete() to free the SHM bridge. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcom_tzmem_shm_bridge_create(phys_addr_t paddr, size_t size, u64 *handle) { u64 pfn_and_ns_perm, ipfn_and_s_perm, size_and_flags; int ret; @@ -117,17 +129,49 @@ static int qcom_tzmem_init_area(struct qcom_tzmem_area *area) if (!qcom_tzmem_using_shm_bridge) return 0; - pfn_and_ns_perm = (u64)area->paddr | QCOM_SCM_PERM_RW; - ipfn_and_s_perm = (u64)area->paddr | QCOM_SCM_PERM_RW; - size_and_flags = area->size | (1 << QCOM_SHM_BRIDGE_NUM_VM_SHIFT); + pfn_and_ns_perm = paddr | QCOM_SCM_PERM_RW; + ipfn_and_s_perm = paddr | QCOM_SCM_PERM_RW; + size_and_flags = size | (1 << QCOM_SHM_BRIDGE_NUM_VM_SHIFT); + + ret = qcom_scm_shm_bridge_create(pfn_and_ns_perm, ipfn_and_s_perm, + size_and_flags, QCOM_SCM_VMID_HLOS, + handle); + if (ret) { + dev_err(qcom_tzmem_dev, + "SHM Bridge failed: ret %d paddr 0x%pa, size %zu\n", + ret, &paddr, size); + + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(qcom_tzmem_shm_bridge_create); + +/** + * qcom_tzmem_shm_bridge_delete() - Delete a SHM bridge. + * @handle: Handle to the SHM bridge. + * + * On platforms that support SHM bridge, this function deletes the SHM bridge + * for the given memory region. The handle must be the same as the one + * returned by qcom_tzmem_shm_bridge_create(). + */ +void qcom_tzmem_shm_bridge_delete(u64 handle) +{ + if (qcom_tzmem_using_shm_bridge) + qcom_scm_shm_bridge_delete(handle); +} +EXPORT_SYMBOL_GPL(qcom_tzmem_shm_bridge_delete); + +static int qcom_tzmem_init_area(struct qcom_tzmem_area *area) +{ + int ret; u64 *handle __free(kfree) = kzalloc(sizeof(*handle), GFP_KERNEL); if (!handle) return -ENOMEM; - ret = qcom_scm_shm_bridge_create(pfn_and_ns_perm, ipfn_and_s_perm, - size_and_flags, QCOM_SCM_VMID_HLOS, - handle); + ret = qcom_tzmem_shm_bridge_create(area->paddr, area->size, handle); if (ret) return ret; @@ -140,10 +184,7 @@ static void qcom_tzmem_cleanup_area(struct qcom_tzmem_area *area) { u64 *handle = area->priv; - if (!qcom_tzmem_using_shm_bridge) - return; - - qcom_scm_shm_bridge_delete(*handle); + qcom_tzmem_shm_bridge_delete(*handle); kfree(handle); } diff --git a/include/linux/firmware/qcom/qcom_tzmem.h b/include/linux/firmware/qcom/qcom_tzmem.h index b83b63a0c049..48ac0e5454c7 100644 --- a/include/linux/firmware/qcom/qcom_tzmem.h +++ b/include/linux/firmware/qcom/qcom_tzmem.h @@ -53,4 +53,19 @@ DEFINE_FREE(qcom_tzmem, void *, if (_T) qcom_tzmem_free(_T)) phys_addr_t qcom_tzmem_to_phys(void *ptr); +#if IS_ENABLED(CONFIG_QCOM_TZMEM_MODE_SHMBRIDGE) +int qcom_tzmem_shm_bridge_create(phys_addr_t paddr, size_t size, u64 *handle); +void qcom_tzmem_shm_bridge_delete(u64 handle); +#else +static inline int qcom_tzmem_shm_bridge_create(phys_addr_t paddr, + size_t size, u64 *handle) +{ + return 0; +} + +static inline void qcom_tzmem_shm_bridge_delete(u64 handle) +{ +} +#endif + #endif /* __QCOM_TZMEM */ -- cgit v1.2.3 From 4b700098c0fc4a76c5c1e54465c8f35e13755294 Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:41 -0700 Subject: firmware: qcom: scm: add support for object invocation Qualcomm TEE (QTEE) hosts Trusted Applications (TAs) and services in the secure world, accessed via objects. A QTEE client can invoke these objects to request services. Similarly, QTEE can request services from the nonsecure world using objects exported to the secure world. 
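As a rough calling sketch built on the wrappers added below (buffer
allocation and argument marshalling are elided; the *_phys variables and
sizes are placeholders):

    u64 result, response_type;
    int ret;

    /* inbuf_phys/outbuf_phys: physical addresses of the marshalled
     * argument buffers, e.g. carved out of a qcom_tzmem pool.
     */
    ret = qcom_scm_qtee_invoke_smc(inbuf_phys, inbuf_size,
                                   outbuf_phys, outbuf_size,
                                   &result, &response_type);
    if (ret)
            return ret;
    /* response_type says whether outbuf now carries the final result
     * or a callback request that must be serviced and answered via
     * qcom_scm_qtee_callback_response().
     */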
Add low-level primitives to facilitate the invocation of objects hosted in QTEE, as well as those hosted in the nonsecure world. If support for object invocation is available, the qcom_scm allocates a dedicated child platform device. The driver for this device communicates with QTEE using low-level primitives. Tested-by: Neil Armstrong Tested-by: Harshal Dev Signed-off-by: Amirreza Zarrabi Link: https://lore.kernel.org/r/20250911-qcom-tee-using-tee-ss-without-mem-obj-v12-2-17f07a942b8d@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/firmware/qcom/qcom_scm.c | 119 +++++++++++++++++++++++++++++++++ drivers/firmware/qcom/qcom_scm.h | 7 ++ include/linux/firmware/qcom/qcom_scm.h | 6 ++ 3 files changed, 132 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/qcom/qcom_scm.c b/drivers/firmware/qcom/qcom_scm.c index 26cd0458aacd..9b5a9a0f68cf 100644 --- a/drivers/firmware/qcom/qcom_scm.c +++ b/drivers/firmware/qcom/qcom_scm.c @@ -2093,6 +2093,122 @@ static int qcom_scm_qseecom_init(struct qcom_scm *scm) #endif /* CONFIG_QCOM_QSEECOM */ +/** + * qcom_scm_qtee_invoke_smc() - Invoke a QTEE object. + * @inbuf: start address of memory area used for inbound buffer. + * @inbuf_size: size of the memory area used for inbound buffer. + * @outbuf: start address of memory area used for outbound buffer. + * @outbuf_size: size of the memory area used for outbound buffer. + * @result: result of QTEE object invocation. + * @response_type: response type returned by QTEE. + * + * @response_type determines how the contents of @inbuf and @outbuf + * should be processed. + * + * Return: On success, return 0 or <0 on failure. + */ +int qcom_scm_qtee_invoke_smc(phys_addr_t inbuf, size_t inbuf_size, + phys_addr_t outbuf, size_t outbuf_size, + u64 *result, u64 *response_type) +{ + struct qcom_scm_desc desc = { + .svc = QCOM_SCM_SVC_SMCINVOKE, + .cmd = QCOM_SCM_SMCINVOKE_INVOKE, + .owner = ARM_SMCCC_OWNER_TRUSTED_OS, + .args[0] = inbuf, + .args[1] = inbuf_size, + .args[2] = outbuf, + .args[3] = outbuf_size, + .arginfo = QCOM_SCM_ARGS(4, QCOM_SCM_RW, QCOM_SCM_VAL, + QCOM_SCM_RW, QCOM_SCM_VAL), + }; + struct qcom_scm_res res; + int ret; + + ret = qcom_scm_call(__scm->dev, &desc, &res); + if (ret) + return ret; + + if (response_type) + *response_type = res.result[0]; + + if (result) + *result = res.result[1]; + + return 0; +} +EXPORT_SYMBOL(qcom_scm_qtee_invoke_smc); + +/** + * qcom_scm_qtee_callback_response() - Submit response for callback request. + * @buf: start address of memory area used for outbound buffer. + * @buf_size: size of the memory area used for outbound buffer. + * @result: Result of QTEE object invocation. + * @response_type: Response type returned by QTEE. + * + * @response_type determines how the contents of @buf should be processed. + * + * Return: On success, return 0 or <0 on failure. 
+ */
+int qcom_scm_qtee_callback_response(phys_addr_t buf, size_t buf_size,
+				    u64 *result, u64 *response_type)
+{
+	struct qcom_scm_desc desc = {
+		.svc = QCOM_SCM_SVC_SMCINVOKE,
+		.cmd = QCOM_SCM_SMCINVOKE_CB_RSP,
+		.owner = ARM_SMCCC_OWNER_TRUSTED_OS,
+		.args[0] = buf,
+		.args[1] = buf_size,
+		.arginfo = QCOM_SCM_ARGS(2, QCOM_SCM_RW, QCOM_SCM_VAL),
+	};
+	struct qcom_scm_res res;
+	int ret;
+
+	ret = qcom_scm_call(__scm->dev, &desc, &res);
+	if (ret)
+		return ret;
+
+	if (response_type)
+		*response_type = res.result[0];
+
+	if (result)
+		*result = res.result[1];
+
+	return 0;
+}
+EXPORT_SYMBOL(qcom_scm_qtee_callback_response);
+
+static void qcom_scm_qtee_free(void *data)
+{
+	struct platform_device *qtee_dev = data;
+
+	platform_device_unregister(qtee_dev);
+}
+
+static void qcom_scm_qtee_init(struct qcom_scm *scm)
+{
+	struct platform_device *qtee_dev;
+	u64 result, response_type;
+	int ret;
+
+	/*
+	 * Probe for smcinvoke support. This will fail due to invalid buffers,
+	 * but first, it checks whether the call is supported in QTEE syscall
+	 * handler. If it is not supported, -EIO is returned.
+	 */
+	ret = qcom_scm_qtee_invoke_smc(0, 0, 0, 0, &result, &response_type);
+	if (ret == -EIO)
+		return;
+
+	/* Setup QTEE interface device. */
+	qtee_dev = platform_device_register_data(scm->dev, "qcomtee",
+						 PLATFORM_DEVID_NONE, NULL, 0);
+	if (IS_ERR(qtee_dev))
+		return;
+
+	devm_add_action_or_reset(scm->dev, qcom_scm_qtee_free, qtee_dev);
+}
+
 /**
  * qcom_scm_is_available() - Checks if SCM is available
  */
@@ -2325,6 +2441,9 @@ static int qcom_scm_probe(struct platform_device *pdev)
 	ret = qcom_scm_qseecom_init(scm);
 	WARN(ret < 0, "failed to initialize qseecom: %d\n", ret);
 
+	/* Initialize the QTEE object interface. */
+	qcom_scm_qtee_init(scm);
+
 	return 0;
 }
diff --git a/drivers/firmware/qcom/qcom_scm.h b/drivers/firmware/qcom/qcom_scm.h
index 0e8dd838099e..a56c8212cc0c 100644
--- a/drivers/firmware/qcom/qcom_scm.h
+++ b/drivers/firmware/qcom/qcom_scm.h
@@ -156,6 +156,13 @@ int qcom_scm_shm_bridge_enable(struct device *scm_dev);
 #define QCOM_SCM_SVC_GPU	0x28
 #define QCOM_SCM_SVC_GPU_INIT_REGS	0x01
 
+/* ARM_SMCCC_OWNER_TRUSTED_OS calls */
+
+#define QCOM_SCM_SVC_SMCINVOKE	0x06
+#define QCOM_SCM_SMCINVOKE_INVOKE_LEGACY	0x00
+#define QCOM_SCM_SMCINVOKE_CB_RSP	0x01
+#define QCOM_SCM_SMCINVOKE_INVOKE	0x02
+
 /* common error codes */
 #define QCOM_SCM_V2_EBUSY	-12
 #define QCOM_SCM_ENOMEM		-5
diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h
index 0f667bf1d4d9..a55ca771286b 100644
--- a/include/linux/firmware/qcom/qcom_scm.h
+++ b/include/linux/firmware/qcom/qcom_scm.h
@@ -175,4 +175,10 @@ static inline int qcom_scm_qseecom_app_send(u32 app_id,
 
 #endif /* CONFIG_QCOM_QSEECOM */
 
+int qcom_scm_qtee_invoke_smc(phys_addr_t inbuf, size_t inbuf_size,
+			     phys_addr_t outbuf, size_t outbuf_size,
+			     u64 *result, u64 *response_type);
+int qcom_scm_qtee_callback_response(phys_addr_t buf, size_t buf_size,
+				    u64 *result, u64 *response_type);
+
 #endif
-- cgit v1.2.3


From 54fd6bd42e7bd351802ff1d193a2e33e4bfb1836 Mon Sep 17 00:00:00 2001
From: Shubhrajyoti Datta
Date: Mon, 8 Sep 2025 17:26:45 +0530
Subject: cdx: Split mcdi.h and reorganize headers

Move bitfield.h from the CDX controller directory to include/linux/cdx
to make it accessible to other drivers. As part of this refactoring,
split mcdi.h into two headers:

- mcdi.h: retains interface-level declarations
- mcdid.h: contains internal definitions and macros

This is in preparation for the VersalNET EDAC driver, which relies on it.
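As a consumer sketch (hypothetical code with a made-up field layout): with
the headers under include/linux/cdx/, any driver can use the bitfield
helpers directly:

    #include <linux/cdx/bitfield.h>

    /* Hypothetical field occupying bits [3:0] of a dword. */
    #define EXAMPLE_FIELD_LBN	0
    #define EXAMPLE_FIELD_WIDTH	4

    struct cdx_dword d;

    CDX_POPULATE_DWORD_1(d, EXAMPLE_FIELD, 0xa);
    /* CDX_DWORD_FIELD(d, EXAMPLE_FIELD) now reads back 0xa. */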
Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov (AMD) Acked-by: Nikhil Agarwal Link: https://lore.kernel.org/20250908115649.22903-1-shubhrajyoti.datta@amd.com --- drivers/cdx/controller/bitfield.h | 90 ------------ drivers/cdx/controller/cdx_controller.c | 2 +- drivers/cdx/controller/cdx_rpmsg.c | 2 +- drivers/cdx/controller/mcdi.c | 5 +- drivers/cdx/controller/mcdi.h | 242 -------------------------------- drivers/cdx/controller/mcdi_functions.c | 1 - drivers/cdx/controller/mcdi_functions.h | 3 +- drivers/cdx/controller/mcdid.h | 63 +++++++++ include/linux/cdx/bitfield.h | 90 ++++++++++++ include/linux/cdx/mcdi.h | 192 +++++++++++++++++++++++++ 10 files changed, 352 insertions(+), 338 deletions(-) delete mode 100644 drivers/cdx/controller/bitfield.h delete mode 100644 drivers/cdx/controller/mcdi.h create mode 100644 drivers/cdx/controller/mcdid.h create mode 100644 include/linux/cdx/bitfield.h create mode 100644 include/linux/cdx/mcdi.h (limited to 'include/linux') diff --git a/drivers/cdx/controller/bitfield.h b/drivers/cdx/controller/bitfield.h deleted file mode 100644 index 567f8ec47582..000000000000 --- a/drivers/cdx/controller/bitfield.h +++ /dev/null @@ -1,90 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Copyright 2005-2006 Fen Systems Ltd. - * Copyright 2006-2013 Solarflare Communications Inc. - * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. - */ - -#ifndef CDX_BITFIELD_H -#define CDX_BITFIELD_H - -#include - -/* Lowest bit numbers and widths */ -#define CDX_DWORD_LBN 0 -#define CDX_DWORD_WIDTH 32 - -/* Specified attribute (e.g. LBN) of the specified field */ -#define CDX_VAL(field, attribute) field ## _ ## attribute -/* Low bit number of the specified field */ -#define CDX_LOW_BIT(field) CDX_VAL(field, LBN) -/* Bit width of the specified field */ -#define CDX_WIDTH(field) CDX_VAL(field, WIDTH) -/* High bit number of the specified field */ -#define CDX_HIGH_BIT(field) (CDX_LOW_BIT(field) + CDX_WIDTH(field) - 1) - -/* A doubleword (i.e. 4 byte) datatype - little-endian in HW */ -struct cdx_dword { - __le32 cdx_u32; -}; - -/* Value expanders for printk */ -#define CDX_DWORD_VAL(dword) \ - ((unsigned int)le32_to_cpu((dword).cdx_u32)) - -/* - * Extract bit field portion [low,high) from the 32-bit little-endian - * element which contains bits [min,max) - */ -#define CDX_DWORD_FIELD(dword, field) \ - (FIELD_GET(GENMASK(CDX_HIGH_BIT(field), CDX_LOW_BIT(field)), \ - le32_to_cpu((dword).cdx_u32))) - -/* - * Creates the portion of the named bit field that lies within the - * range [min,max). - */ -#define CDX_INSERT_FIELD(field, value) \ - (FIELD_PREP(GENMASK(CDX_HIGH_BIT(field), \ - CDX_LOW_BIT(field)), value)) - -/* - * Creates the portion of the named bit fields that lie within the - * range [min,max). - */ -#define CDX_INSERT_FIELDS(field1, value1, \ - field2, value2, \ - field3, value3, \ - field4, value4, \ - field5, value5, \ - field6, value6, \ - field7, value7) \ - (CDX_INSERT_FIELD(field1, (value1)) | \ - CDX_INSERT_FIELD(field2, (value2)) | \ - CDX_INSERT_FIELD(field3, (value3)) | \ - CDX_INSERT_FIELD(field4, (value4)) | \ - CDX_INSERT_FIELD(field5, (value5)) | \ - CDX_INSERT_FIELD(field6, (value6)) | \ - CDX_INSERT_FIELD(field7, (value7))) - -#define CDX_POPULATE_DWORD(dword, ...) \ - (dword).cdx_u32 = cpu_to_le32(CDX_INSERT_FIELDS(__VA_ARGS__)) - -/* Populate a dword field with various numbers of arguments */ -#define CDX_POPULATE_DWORD_7 CDX_POPULATE_DWORD -#define CDX_POPULATE_DWORD_6(dword, ...) 
\ - CDX_POPULATE_DWORD_7(dword, CDX_DWORD, 0, __VA_ARGS__) -#define CDX_POPULATE_DWORD_5(dword, ...) \ - CDX_POPULATE_DWORD_6(dword, CDX_DWORD, 0, __VA_ARGS__) -#define CDX_POPULATE_DWORD_4(dword, ...) \ - CDX_POPULATE_DWORD_5(dword, CDX_DWORD, 0, __VA_ARGS__) -#define CDX_POPULATE_DWORD_3(dword, ...) \ - CDX_POPULATE_DWORD_4(dword, CDX_DWORD, 0, __VA_ARGS__) -#define CDX_POPULATE_DWORD_2(dword, ...) \ - CDX_POPULATE_DWORD_3(dword, CDX_DWORD, 0, __VA_ARGS__) -#define CDX_POPULATE_DWORD_1(dword, ...) \ - CDX_POPULATE_DWORD_2(dword, CDX_DWORD, 0, __VA_ARGS__) -#define CDX_SET_DWORD(dword) \ - CDX_POPULATE_DWORD_1(dword, CDX_DWORD, 0xffffffff) - -#endif /* CDX_BITFIELD_H */ diff --git a/drivers/cdx/controller/cdx_controller.c b/drivers/cdx/controller/cdx_controller.c index fca83141e3e6..3f8b9041babf 100644 --- a/drivers/cdx/controller/cdx_controller.c +++ b/drivers/cdx/controller/cdx_controller.c @@ -14,7 +14,7 @@ #include "cdx_controller.h" #include "../cdx.h" #include "mcdi_functions.h" -#include "mcdi.h" +#include "mcdid.h" static unsigned int cdx_mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd) { diff --git a/drivers/cdx/controller/cdx_rpmsg.c b/drivers/cdx/controller/cdx_rpmsg.c index 04b578a0be17..d4f763323aac 100644 --- a/drivers/cdx/controller/cdx_rpmsg.c +++ b/drivers/cdx/controller/cdx_rpmsg.c @@ -15,7 +15,7 @@ #include "../cdx.h" #include "cdx_controller.h" #include "mcdi_functions.h" -#include "mcdi.h" +#include "mcdid.h" static struct rpmsg_device_id cdx_rpmsg_id_table[] = { { .name = "mcdi_ipc" }, diff --git a/drivers/cdx/controller/mcdi.c b/drivers/cdx/controller/mcdi.c index e760f8d347cc..90bf9f7c257b 100644 --- a/drivers/cdx/controller/mcdi.c +++ b/drivers/cdx/controller/mcdi.c @@ -23,9 +23,10 @@ #include #include #include +#include -#include "bitfield.h" -#include "mcdi.h" +#include +#include "mcdid.h" static void cdx_mcdi_cancel_cmd(struct cdx_mcdi *cdx, struct cdx_mcdi_cmd *cmd); static void cdx_mcdi_wait_for_cleanup(struct cdx_mcdi *cdx); diff --git a/drivers/cdx/controller/mcdi.h b/drivers/cdx/controller/mcdi.h deleted file mode 100644 index 54a65e9760ae..000000000000 --- a/drivers/cdx/controller/mcdi.h +++ /dev/null @@ -1,242 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Copyright 2008-2013 Solarflare Communications Inc. - * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. - */ - -#ifndef CDX_MCDI_H -#define CDX_MCDI_H - -#include -#include -#include - -#include "bitfield.h" -#include "mc_cdx_pcol.h" - -#ifdef DEBUG -#define CDX_WARN_ON_ONCE_PARANOID(x) WARN_ON_ONCE(x) -#define CDX_WARN_ON_PARANOID(x) WARN_ON(x) -#else -#define CDX_WARN_ON_ONCE_PARANOID(x) do {} while (0) -#define CDX_WARN_ON_PARANOID(x) do {} while (0) -#endif - -/** - * enum cdx_mcdi_mode - MCDI transaction mode - * @MCDI_MODE_EVENTS: wait for an mcdi response callback. - * @MCDI_MODE_FAIL: we think MCDI is dead, so fail-fast all calls - */ -enum cdx_mcdi_mode { - MCDI_MODE_EVENTS, - MCDI_MODE_FAIL, -}; - -#define MCDI_RPC_TIMEOUT (10 * HZ) -#define MCDI_RPC_LONG_TIMEOU (60 * HZ) -#define MCDI_RPC_POST_RST_TIME (10 * HZ) - -#define MCDI_BUF_LEN (8 + MCDI_CTL_SDU_LEN_MAX) - -/** - * enum cdx_mcdi_cmd_state - State for an individual MCDI command - * @MCDI_STATE_QUEUED: Command not started and is waiting to run. - * @MCDI_STATE_RETRY: Command was submitted and MC rejected with no resources, - * as MC have too many outstanding commands. Command will be retried once - * another command returns. - * @MCDI_STATE_RUNNING: Command was accepted and is running. 
- * @MCDI_STATE_RUNNING_CANCELLED: Command is running but the issuer cancelled - * the command. - * @MCDI_STATE_FINISHED: Processing of this command has completed. - */ - -enum cdx_mcdi_cmd_state { - MCDI_STATE_QUEUED, - MCDI_STATE_RETRY, - MCDI_STATE_RUNNING, - MCDI_STATE_RUNNING_CANCELLED, - MCDI_STATE_FINISHED, -}; - -/** - * struct cdx_mcdi - CDX MCDI Firmware interface, to interact - * with CDX controller. - * @mcdi: MCDI interface - * @mcdi_ops: MCDI operations - * @r5_rproc : R5 Remoteproc device handle - * @rpdev: RPMsg device - * @ept: RPMsg endpoint - * @work: Post probe work - */ -struct cdx_mcdi { - /* MCDI interface */ - struct cdx_mcdi_data *mcdi; - const struct cdx_mcdi_ops *mcdi_ops; - - struct rproc *r5_rproc; - struct rpmsg_device *rpdev; - struct rpmsg_endpoint *ept; - struct work_struct work; -}; - -struct cdx_mcdi_ops { - void (*mcdi_request)(struct cdx_mcdi *cdx, - const struct cdx_dword *hdr, size_t hdr_len, - const struct cdx_dword *sdu, size_t sdu_len); - unsigned int (*mcdi_rpc_timeout)(struct cdx_mcdi *cdx, unsigned int cmd); -}; - -typedef void cdx_mcdi_async_completer(struct cdx_mcdi *cdx, - unsigned long cookie, int rc, - struct cdx_dword *outbuf, - size_t outlen_actual); - -/** - * struct cdx_mcdi_cmd - An outstanding MCDI command - * @ref: Reference count. There will be one reference if the command is - * in the mcdi_iface cmd_list, another if it's on a cleanup list, - * and a third if it's queued in the work queue. - * @list: The data for this entry in mcdi->cmd_list - * @cleanup_list: The data for this entry in a cleanup list - * @work: The work item for this command, queued in mcdi->workqueue - * @mcdi: The mcdi_iface for this command - * @state: The state of this command - * @inlen: inbuf length - * @inbuf: Input buffer - * @quiet: Whether to silence errors - * @reboot_seen: Whether a reboot has been seen during this command, - * to prevent duplicates - * @seq: Sequence number - * @started: Jiffies this command was started at - * @cookie: Context for completion function - * @completer: Completion function - * @handle: Command handle - * @cmd: Command number - * @rc: Return code - * @outlen: Length of output buffer - * @outbuf: Output buffer - */ -struct cdx_mcdi_cmd { - struct kref ref; - struct list_head list; - struct list_head cleanup_list; - struct work_struct work; - struct cdx_mcdi_iface *mcdi; - enum cdx_mcdi_cmd_state state; - size_t inlen; - const struct cdx_dword *inbuf; - bool quiet; - bool reboot_seen; - u8 seq; - unsigned long started; - unsigned long cookie; - cdx_mcdi_async_completer *completer; - unsigned int handle; - unsigned int cmd; - int rc; - size_t outlen; - struct cdx_dword *outbuf; - /* followed by inbuf data if necessary */ -}; - -/** - * struct cdx_mcdi_iface - MCDI protocol context - * @cdx: The associated NIC - * @iface_lock: Serialise access to this structure - * @outstanding_cleanups: Count of cleanups - * @cmd_list: List of outstanding and running commands - * @workqueue: Workqueue used for delayed processing - * @cmd_complete_wq: Waitqueue for command completion - * @db_held_by: Command the MC doorbell is in use by - * @seq_held_by: Command each sequence number is in use by - * @prev_handle: The last used command handle - * @mode: Poll for mcdi completion, or wait for an mcdi_event - * @prev_seq: The last used sequence number - * @new_epoch: Indicates start of day or start of MC reboot recovery - */ -struct cdx_mcdi_iface { - struct cdx_mcdi *cdx; - /* Serialise access */ - struct mutex iface_lock; - unsigned int 
outstanding_cleanups; - struct list_head cmd_list; - struct workqueue_struct *workqueue; - wait_queue_head_t cmd_complete_wq; - struct cdx_mcdi_cmd *db_held_by; - struct cdx_mcdi_cmd *seq_held_by[16]; - unsigned int prev_handle; - enum cdx_mcdi_mode mode; - u8 prev_seq; - bool new_epoch; -}; - -/** - * struct cdx_mcdi_data - extra state for NICs that implement MCDI - * @iface: Interface/protocol state - * @fn_flags: Flags for this function, as returned by %MC_CMD_DRV_ATTACH. - */ -struct cdx_mcdi_data { - struct cdx_mcdi_iface iface; - u32 fn_flags; -}; - -static inline struct cdx_mcdi_iface *cdx_mcdi_if(struct cdx_mcdi *cdx) -{ - return cdx->mcdi ? &cdx->mcdi->iface : NULL; -} - -int cdx_mcdi_init(struct cdx_mcdi *cdx); -void cdx_mcdi_finish(struct cdx_mcdi *cdx); - -void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int len); -int cdx_mcdi_rpc(struct cdx_mcdi *cdx, unsigned int cmd, - const struct cdx_dword *inbuf, size_t inlen, - struct cdx_dword *outbuf, size_t outlen, size_t *outlen_actual); -int cdx_mcdi_rpc_async(struct cdx_mcdi *cdx, unsigned int cmd, - const struct cdx_dword *inbuf, size_t inlen, - cdx_mcdi_async_completer *complete, - unsigned long cookie); -int cdx_mcdi_wait_for_quiescence(struct cdx_mcdi *cdx, - unsigned int timeout_jiffies); - -/* - * We expect that 16- and 32-bit fields in MCDI requests and responses - * are appropriately aligned, but 64-bit fields are only - * 32-bit-aligned. - */ -#define MCDI_DECLARE_BUF(_name, _len) struct cdx_dword _name[DIV_ROUND_UP(_len, 4)] = {{0}} -#define _MCDI_PTR(_buf, _offset) \ - ((u8 *)(_buf) + (_offset)) -#define MCDI_PTR(_buf, _field) \ - _MCDI_PTR(_buf, MC_CMD_ ## _field ## _OFST) -#define _MCDI_CHECK_ALIGN(_ofst, _align) \ - ((void)BUILD_BUG_ON_ZERO((_ofst) & ((_align) - 1)), \ - (_ofst)) -#define _MCDI_DWORD(_buf, _field) \ - ((_buf) + (_MCDI_CHECK_ALIGN(MC_CMD_ ## _field ## _OFST, 4) >> 2)) - -#define MCDI_BYTE(_buf, _field) \ - ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 1), \ - *MCDI_PTR(_buf, _field)) -#define MCDI_WORD(_buf, _field) \ - ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 2), \ - le16_to_cpu(*(__force const __le16 *)MCDI_PTR(_buf, _field))) -#define MCDI_SET_DWORD(_buf, _field, _value) \ - CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), CDX_DWORD, _value) -#define MCDI_DWORD(_buf, _field) \ - CDX_DWORD_FIELD(*_MCDI_DWORD(_buf, _field), CDX_DWORD) -#define MCDI_POPULATE_DWORD_1(_buf, _field, _name1, _value1) \ - CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), \ - MC_CMD_ ## _name1, _value1) -#define MCDI_SET_QWORD(_buf, _field, _value) \ - do { \ - CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[0], \ - CDX_DWORD, (u32)(_value)); \ - CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[1], \ - CDX_DWORD, (u64)(_value) >> 32); \ - } while (0) -#define MCDI_QWORD(_buf, _field) \ - (CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[0], CDX_DWORD) | \ - (u64)CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[1], CDX_DWORD) << 32) - -#endif /* CDX_MCDI_H */ diff --git a/drivers/cdx/controller/mcdi_functions.c b/drivers/cdx/controller/mcdi_functions.c index 885c69e6ebe5..8ae2d99be81e 100644 --- a/drivers/cdx/controller/mcdi_functions.c +++ b/drivers/cdx/controller/mcdi_functions.c @@ -5,7 +5,6 @@ #include -#include "mcdi.h" #include "mcdi_functions.h" int cdx_mcdi_get_num_buses(struct cdx_mcdi *cdx) diff --git a/drivers/cdx/controller/mcdi_functions.h b/drivers/cdx/controller/mcdi_functions.h index b9942affdc6b..57fd1bae706b 100644 --- a/drivers/cdx/controller/mcdi_functions.h +++ 
b/drivers/cdx/controller/mcdi_functions.h @@ -8,7 +8,8 @@ #ifndef CDX_MCDI_FUNCTIONS_H #define CDX_MCDI_FUNCTIONS_H -#include "mcdi.h" +#include +#include "mcdid.h" #include "../cdx.h" /** diff --git a/drivers/cdx/controller/mcdid.h b/drivers/cdx/controller/mcdid.h new file mode 100644 index 000000000000..7fc29f099265 --- /dev/null +++ b/drivers/cdx/controller/mcdid.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2008-2013 Solarflare Communications Inc. + * Copyright (C) 2022-2025, Advanced Micro Devices, Inc. + */ + +#ifndef CDX_MCDID_H +#define CDX_MCDID_H + +#include +#include +#include + +#include "mc_cdx_pcol.h" + +#ifdef DEBUG +#define CDX_WARN_ON_ONCE_PARANOID(x) WARN_ON_ONCE(x) +#define CDX_WARN_ON_PARANOID(x) WARN_ON(x) +#else +#define CDX_WARN_ON_ONCE_PARANOID(x) do {} while (0) +#define CDX_WARN_ON_PARANOID(x) do {} while (0) +#endif + +#define MCDI_BUF_LEN (8 + MCDI_CTL_SDU_LEN_MAX) + +static inline struct cdx_mcdi_iface *cdx_mcdi_if(struct cdx_mcdi *cdx) +{ + return cdx->mcdi ? &cdx->mcdi->iface : NULL; +} + +int cdx_mcdi_rpc_async(struct cdx_mcdi *cdx, unsigned int cmd, + const struct cdx_dword *inbuf, size_t inlen, + cdx_mcdi_async_completer *complete, + unsigned long cookie); +int cdx_mcdi_wait_for_quiescence(struct cdx_mcdi *cdx, + unsigned int timeout_jiffies); + +/* + * We expect that 16- and 32-bit fields in MCDI requests and responses + * are appropriately aligned, but 64-bit fields are only + * 32-bit-aligned. + */ +#define MCDI_BYTE(_buf, _field) \ + ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 1), \ + *MCDI_PTR(_buf, _field)) +#define MCDI_WORD(_buf, _field) \ + ((void)BUILD_BUG_ON_ZERO(MC_CMD_ ## _field ## _LEN != 2), \ + le16_to_cpu(*(__force const __le16 *)MCDI_PTR(_buf, _field))) +#define MCDI_POPULATE_DWORD_1(_buf, _field, _name1, _value1) \ + CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), \ + MC_CMD_ ## _name1, _value1) +#define MCDI_SET_QWORD(_buf, _field, _value) \ + do { \ + CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[0], \ + CDX_DWORD, (u32)(_value)); \ + CDX_POPULATE_DWORD_1(_MCDI_DWORD(_buf, _field)[1], \ + CDX_DWORD, (u64)(_value) >> 32); \ + } while (0) +#define MCDI_QWORD(_buf, _field) \ + (CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[0], CDX_DWORD) | \ + (u64)CDX_DWORD_FIELD(_MCDI_DWORD(_buf, _field)[1], CDX_DWORD) << 32) + +#endif /* CDX_MCDID_H */ diff --git a/include/linux/cdx/bitfield.h b/include/linux/cdx/bitfield.h new file mode 100644 index 000000000000..567f8ec47582 --- /dev/null +++ b/include/linux/cdx/bitfield.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2005-2006 Fen Systems Ltd. + * Copyright 2006-2013 Solarflare Communications Inc. + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + */ + +#ifndef CDX_BITFIELD_H +#define CDX_BITFIELD_H + +#include + +/* Lowest bit numbers and widths */ +#define CDX_DWORD_LBN 0 +#define CDX_DWORD_WIDTH 32 + +/* Specified attribute (e.g. LBN) of the specified field */ +#define CDX_VAL(field, attribute) field ## _ ## attribute +/* Low bit number of the specified field */ +#define CDX_LOW_BIT(field) CDX_VAL(field, LBN) +/* Bit width of the specified field */ +#define CDX_WIDTH(field) CDX_VAL(field, WIDTH) +/* High bit number of the specified field */ +#define CDX_HIGH_BIT(field) (CDX_LOW_BIT(field) + CDX_WIDTH(field) - 1) + +/* A doubleword (i.e. 
4 byte) datatype - little-endian in HW */ +struct cdx_dword { + __le32 cdx_u32; +}; + +/* Value expanders for printk */ +#define CDX_DWORD_VAL(dword) \ + ((unsigned int)le32_to_cpu((dword).cdx_u32)) + +/* + * Extract bit field portion [low,high) from the 32-bit little-endian + * element which contains bits [min,max) + */ +#define CDX_DWORD_FIELD(dword, field) \ + (FIELD_GET(GENMASK(CDX_HIGH_BIT(field), CDX_LOW_BIT(field)), \ + le32_to_cpu((dword).cdx_u32))) + +/* + * Creates the portion of the named bit field that lies within the + * range [min,max). + */ +#define CDX_INSERT_FIELD(field, value) \ + (FIELD_PREP(GENMASK(CDX_HIGH_BIT(field), \ + CDX_LOW_BIT(field)), value)) + +/* + * Creates the portion of the named bit fields that lie within the + * range [min,max). + */ +#define CDX_INSERT_FIELDS(field1, value1, \ + field2, value2, \ + field3, value3, \ + field4, value4, \ + field5, value5, \ + field6, value6, \ + field7, value7) \ + (CDX_INSERT_FIELD(field1, (value1)) | \ + CDX_INSERT_FIELD(field2, (value2)) | \ + CDX_INSERT_FIELD(field3, (value3)) | \ + CDX_INSERT_FIELD(field4, (value4)) | \ + CDX_INSERT_FIELD(field5, (value5)) | \ + CDX_INSERT_FIELD(field6, (value6)) | \ + CDX_INSERT_FIELD(field7, (value7))) + +#define CDX_POPULATE_DWORD(dword, ...) \ + (dword).cdx_u32 = cpu_to_le32(CDX_INSERT_FIELDS(__VA_ARGS__)) + +/* Populate a dword field with various numbers of arguments */ +#define CDX_POPULATE_DWORD_7 CDX_POPULATE_DWORD +#define CDX_POPULATE_DWORD_6(dword, ...) \ + CDX_POPULATE_DWORD_7(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_POPULATE_DWORD_5(dword, ...) \ + CDX_POPULATE_DWORD_6(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_POPULATE_DWORD_4(dword, ...) \ + CDX_POPULATE_DWORD_5(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_POPULATE_DWORD_3(dword, ...) \ + CDX_POPULATE_DWORD_4(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_POPULATE_DWORD_2(dword, ...) \ + CDX_POPULATE_DWORD_3(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_POPULATE_DWORD_1(dword, ...) \ + CDX_POPULATE_DWORD_2(dword, CDX_DWORD, 0, __VA_ARGS__) +#define CDX_SET_DWORD(dword) \ + CDX_POPULATE_DWORD_1(dword, CDX_DWORD, 0xffffffff) + +#endif /* CDX_BITFIELD_H */ diff --git a/include/linux/cdx/mcdi.h b/include/linux/cdx/mcdi.h new file mode 100644 index 000000000000..46e3f63b062a --- /dev/null +++ b/include/linux/cdx/mcdi.h @@ -0,0 +1,192 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright 2008-2013 Solarflare Communications Inc. + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + */ + +#ifndef CDX_MCDI_H +#define CDX_MCDI_H + +#include +#include +#include + +#include "linux/cdx/bitfield.h" + +/** + * enum cdx_mcdi_mode - MCDI transaction mode + * @MCDI_MODE_EVENTS: wait for an mcdi response callback. + * @MCDI_MODE_FAIL: we think MCDI is dead, so fail-fast all calls + */ +enum cdx_mcdi_mode { + MCDI_MODE_EVENTS, + MCDI_MODE_FAIL, +}; + +#define MCDI_RPC_TIMEOUT (10 * HZ) +#define MCDI_RPC_LONG_TIMEOU (60 * HZ) +#define MCDI_RPC_POST_RST_TIME (10 * HZ) + +/** + * enum cdx_mcdi_cmd_state - State for an individual MCDI command + * @MCDI_STATE_QUEUED: Command not started and is waiting to run. + * @MCDI_STATE_RETRY: Command was submitted and MC rejected with no resources, + * as MC have too many outstanding commands. Command will be retried once + * another command returns. + * @MCDI_STATE_RUNNING: Command was accepted and is running. + * @MCDI_STATE_RUNNING_CANCELLED: Command is running but the issuer cancelled + * the command. + * @MCDI_STATE_FINISHED: Processing of this command has completed. 
+ */ + +enum cdx_mcdi_cmd_state { + MCDI_STATE_QUEUED, + MCDI_STATE_RETRY, + MCDI_STATE_RUNNING, + MCDI_STATE_RUNNING_CANCELLED, + MCDI_STATE_FINISHED, +}; + +/** + * struct cdx_mcdi - CDX MCDI Firmware interface, to interact + * with CDX controller. + * @mcdi: MCDI interface + * @mcdi_ops: MCDI operations + * @r5_rproc : R5 Remoteproc device handle + * @rpdev: RPMsg device + * @ept: RPMsg endpoint + * @work: Post probe work + */ +struct cdx_mcdi { + /* MCDI interface */ + struct cdx_mcdi_data *mcdi; + const struct cdx_mcdi_ops *mcdi_ops; + + struct rproc *r5_rproc; + struct rpmsg_device *rpdev; + struct rpmsg_endpoint *ept; + struct work_struct work; +}; + +struct cdx_mcdi_ops { + void (*mcdi_request)(struct cdx_mcdi *cdx, + const struct cdx_dword *hdr, size_t hdr_len, + const struct cdx_dword *sdu, size_t sdu_len); + unsigned int (*mcdi_rpc_timeout)(struct cdx_mcdi *cdx, unsigned int cmd); +}; + +typedef void cdx_mcdi_async_completer(struct cdx_mcdi *cdx, + unsigned long cookie, int rc, + struct cdx_dword *outbuf, + size_t outlen_actual); + +/** + * struct cdx_mcdi_cmd - An outstanding MCDI command + * @ref: Reference count. There will be one reference if the command is + * in the mcdi_iface cmd_list, another if it's on a cleanup list, + * and a third if it's queued in the work queue. + * @list: The data for this entry in mcdi->cmd_list + * @cleanup_list: The data for this entry in a cleanup list + * @work: The work item for this command, queued in mcdi->workqueue + * @mcdi: The mcdi_iface for this command + * @state: The state of this command + * @inlen: inbuf length + * @inbuf: Input buffer + * @quiet: Whether to silence errors + * @reboot_seen: Whether a reboot has been seen during this command, + * to prevent duplicates + * @seq: Sequence number + * @started: Jiffies this command was started at + * @cookie: Context for completion function + * @completer: Completion function + * @handle: Command handle + * @cmd: Command number + * @rc: Return code + * @outlen: Length of output buffer + * @outbuf: Output buffer + */ +struct cdx_mcdi_cmd { + struct kref ref; + struct list_head list; + struct list_head cleanup_list; + struct work_struct work; + struct cdx_mcdi_iface *mcdi; + enum cdx_mcdi_cmd_state state; + size_t inlen; + const struct cdx_dword *inbuf; + bool quiet; + bool reboot_seen; + u8 seq; + unsigned long started; + unsigned long cookie; + cdx_mcdi_async_completer *completer; + unsigned int handle; + unsigned int cmd; + int rc; + size_t outlen; + struct cdx_dword *outbuf; + /* followed by inbuf data if necessary */ +}; + +/** + * struct cdx_mcdi_iface - MCDI protocol context + * @cdx: The associated NIC + * @iface_lock: Serialise access to this structure + * @outstanding_cleanups: Count of cleanups + * @cmd_list: List of outstanding and running commands + * @workqueue: Workqueue used for delayed processing + * @cmd_complete_wq: Waitqueue for command completion + * @db_held_by: Command the MC doorbell is in use by + * @seq_held_by: Command each sequence number is in use by + * @prev_handle: The last used command handle + * @mode: Poll for mcdi completion, or wait for an mcdi_event + * @prev_seq: The last used sequence number + * @new_epoch: Indicates start of day or start of MC reboot recovery + */ +struct cdx_mcdi_iface { + struct cdx_mcdi *cdx; + /* Serialise access */ + struct mutex iface_lock; + unsigned int outstanding_cleanups; + struct list_head cmd_list; + struct workqueue_struct *workqueue; + wait_queue_head_t cmd_complete_wq; + struct cdx_mcdi_cmd *db_held_by; + struct 
cdx_mcdi_cmd *seq_held_by[16];
+	unsigned int prev_handle;
+	enum cdx_mcdi_mode mode;
+	u8 prev_seq;
+	bool new_epoch;
+};
+
+/**
+ * struct cdx_mcdi_data - extra state for NICs that implement MCDI
+ * @iface: Interface/protocol state
+ * @fn_flags: Flags for this function, as returned by %MC_CMD_DRV_ATTACH.
+ */
+struct cdx_mcdi_data {
+	struct cdx_mcdi_iface iface;
+	u32 fn_flags;
+};
+
+/*
+ * We expect that 16- and 32-bit fields in MCDI requests and responses
+ * are appropriately aligned, but 64-bit fields are only
+ * 32-bit-aligned.
+ */
+#define MCDI_DECLARE_BUF(_name, _len) struct cdx_dword _name[DIV_ROUND_UP(_len, 4)] = {{0}}
+#define _MCDI_PTR(_buf, _offset) \
+	((u8 *)(_buf) + (_offset))
+#define MCDI_PTR(_buf, _field) \
+	_MCDI_PTR(_buf, MC_CMD_ ## _field ## _OFST)
+#define _MCDI_CHECK_ALIGN(_ofst, _align) \
+	((void)BUILD_BUG_ON_ZERO((_ofst) & ((_align) - 1)), \
+	 (_ofst))
+#define _MCDI_DWORD(_buf, _field) \
+	((_buf) + (_MCDI_CHECK_ALIGN(MC_CMD_ ## _field ## _OFST, 4) >> 2))
+
+#define MCDI_SET_DWORD(_buf, _field, _value) \
+	CDX_POPULATE_DWORD_1(*_MCDI_DWORD(_buf, _field), CDX_DWORD, _value)
+#define MCDI_DWORD(_buf, _field) \
+	CDX_DWORD_FIELD(*_MCDI_DWORD(_buf, _field), CDX_DWORD)
+#endif /* CDX_MCDI_H */
-- cgit v1.2.3


From f99b3917789d83ea89b24b722d784956f8289f45 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik
Date: Mon, 15 Sep 2025 14:57:29 +0200
Subject: fs: rename generic_delete_inode() and generic_drop_inode()

generic_delete_inode() is rather misleading about what the routine is
doing. inode_just_drop() should be much clearer.

The new naming is inconsistent with generic_drop_inode(), so rename
that one as well with inode_ as the prefix.

No functional changes.

Signed-off-by: Mateusz Guzik
Reviewed-by: Jan Kara
Signed-off-by: Christian Brauner
---
 Documentation/filesystems/porting.rst |  4 ++--
 Documentation/filesystems/vfs.rst     |  4 ++--
 block/bdev.c                          |  2 +-
 drivers/dax/super.c                   |  2 +-
 drivers/misc/ibmasm/ibmasmfs.c        |  2 +-
 drivers/usb/gadget/function/f_fs.c    |  2 +-
 drivers/usb/gadget/legacy/inode.c     |  2 +-
 fs/9p/vfs_super.c                     |  2 +-
 fs/afs/inode.c                        |  4 ++--
 fs/btrfs/inode.c                      |  2 +-
 fs/ceph/super.c                       |  2 +-
 fs/configfs/mount.c                   |  2 +-
 fs/efivarfs/super.c                   |  2 +-
 fs/ext4/super.c                       |  2 +-
 fs/f2fs/super.c                       |  2 +-
 fs/fuse/inode.c                       |  2 +-
 fs/gfs2/super.c                       |  2 +-
 fs/hostfs/hostfs_kern.c               |  2 +-
 fs/inode.c                            |  6 +++---
 fs/kernfs/mount.c                     |  2 +-
 fs/nfs/inode.c                        |  2 +-
 fs/ocfs2/dlmfs/dlmfs.c                |  2 +-
 fs/orangefs/super.c                   |  2 +-
 fs/overlayfs/super.c                  |  2 +-
 fs/pidfs.c                            |  2 +-
 fs/proc/inode.c                       |  2 +-
 fs/pstore/inode.c                     |  2 +-
 fs/ramfs/inode.c                      |  2 +-
 fs/smb/client/cifsfs.c                |  2 +-
 fs/ubifs/super.c                      |  2 +-
 fs/xfs/xfs_super.c                    |  2 +-
 include/linux/fs.h                    |  4 ++--
 kernel/bpf/inode.c                    |  2 +-
 mm/shmem.c                            |  2 +-
 34 files changed, 40 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 85f590254f07..b5db45c0094c 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -340,8 +340,8 @@ of those. Caller makes sure async writeback cannot be running for the inode whil
 
 ->drop_inode() returns int now; it's called on final iput() with
 inode->i_lock held and it returns true if filesystems wants the inode to be
-dropped. As before, generic_drop_inode() is still the default and it's been
-updated appropriately. generic_delete_inode() is also alive and it consists
+dropped. As before, inode_generic_drop() is still the default and it's been
+updated appropriately.
inode_just_drop() is also alive and it consists simply of return 1. Note that all actual eviction work is done by caller after ->drop_inode() returns. diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 486a91633474..7a314eee6305 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -327,11 +327,11 @@ or bottom half). inode->i_lock spinlock held. This method should be either NULL (normal UNIX filesystem - semantics) or "generic_delete_inode" (for filesystems that do + semantics) or "inode_just_drop" (for filesystems that do not want to cache inodes - causing "delete_inode" to always be called regardless of the value of i_nlink) - The "generic_delete_inode()" behavior is equivalent to the old + The "inode_just_drop()" behavior is equivalent to the old practice of using "force_delete" in the put_inode() case, but does not have the races that the "force_delete()" approach had. diff --git a/block/bdev.c b/block/bdev.c index b77ddd12dc06..810707cca970 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -412,7 +412,7 @@ static const struct super_operations bdev_sops = { .statfs = simple_statfs, .alloc_inode = bdev_alloc_inode, .free_inode = bdev_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = bdev_evict_inode, }; diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 54c480e874cb..d7714d8afb0f 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -388,7 +388,7 @@ static const struct super_operations dax_sops = { .alloc_inode = dax_alloc_inode, .destroy_inode = dax_destroy_inode, .free_inode = dax_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static int dax_init_fs_context(struct fs_context *fc) diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c index c44de892a61e..5372ed2a363e 100644 --- a/drivers/misc/ibmasm/ibmasmfs.c +++ b/drivers/misc/ibmasm/ibmasmfs.c @@ -94,7 +94,7 @@ static int ibmasmfs_init_fs_context(struct fs_context *fc) static const struct super_operations ibmasmfs_s_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static const struct file_operations *ibmasmfs_dir_ops = &simple_dir_operations; diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 08a251df20c4..5246fa6af3d6 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -1891,7 +1891,7 @@ static struct dentry *ffs_sb_create_file(struct super_block *sb, /* Super block */ static const struct super_operations ffs_sb_operations = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; struct ffs_sb_fill_data { diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index b51e132b0cd2..13c3da49348c 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -2011,7 +2011,7 @@ gadgetfs_create_file (struct super_block *sb, char const *name, static const struct super_operations gadget_fs_operations = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static int diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 795c6388744c..1581ebac5bb4 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -252,7 +252,7 @@ static int v9fs_drop_inode(struct inode *inode) v9ses = v9fs_inode2v9ses(inode); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) - return 
generic_drop_inode(inode); + return inode_generic_drop(inode); /* * in case of non cached mode always drop the * inode because we want the inode attribute diff --git a/fs/afs/inode.c b/fs/afs/inode.c index e9538e91f848..e1cb17b85791 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -723,9 +723,9 @@ int afs_drop_inode(struct inode *inode) _enter(""); if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags)) - return generic_delete_inode(inode); + return inode_just_drop(inode); else - return generic_drop_inode(inode); + return inode_generic_drop(inode); } /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b77dd22b8cdb..2081654dea5c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7953,7 +7953,7 @@ int btrfs_drop_inode(struct inode *inode) if (btrfs_root_refs(&root->root_item) == 0) return 1; else - return generic_drop_inode(inode); + return inode_generic_drop(inode); } static void init_once(void *foo) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c3eb651862c5..70dc9467f6a0 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1042,7 +1042,7 @@ static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .free_inode = ceph_free_inode, .write_inode = ceph_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 740f18b60c9d..456c4a2efb53 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -36,7 +36,7 @@ static void configfs_free_inode(struct inode *inode) static const struct super_operations configfs_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .free_inode = configfs_free_inode, }; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index c4a139911356..6f131910be9d 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -127,7 +127,7 @@ static int efivarfs_unfreeze_fs(struct super_block *sb); static const struct super_operations efivarfs_ops = { .statfs = efivarfs_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .alloc_inode = efivarfs_alloc_inode, .free_inode = efivarfs_free_inode, .show_options = efivarfs_show_options, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c7d39da7e733..848e059d58cf 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1417,7 +1417,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) static int ext4_drop_inode(struct inode *inode) { - int drop = generic_drop_inode(inode); + int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e16c4e2830c2..63cf73409da6 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1768,7 +1768,7 @@ static int f2fs_drop_inode(struct inode *inode) trace_f2fs_drop_inode(inode, 0); return 0; } - ret = generic_drop_inode(inode); + ret = inode_generic_drop(inode); if (!ret) ret = fscrypt_drop_inode(inode); trace_f2fs_drop_inode(inode, ret); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ecb869e895ab..63390575b6fd 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1209,7 +1209,7 @@ static const struct super_operations fuse_super_operations = { .free_inode = fuse_free_inode, .evict_inode = fuse_evict_inode, .write_inode = fuse_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, .sync_fs = fuse_sync_fs, diff --git 
a/fs/gfs2/super.c b/fs/gfs2/super.c index b42e2110084b..644b2d1e7276 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1050,7 +1050,7 @@ static int gfs2_drop_inode(struct inode *inode) if (test_bit(SDF_EVICTING, &sdp->sd_flags)) return 1; - return generic_drop_inode(inode); + return inode_generic_drop(inode); } /** diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 01e516175bcd..1e1acf5775ab 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -261,7 +261,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root) static const struct super_operations hostfs_sbops = { .alloc_inode = hostfs_alloc_inode, .free_inode = hostfs_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = hostfs_evict_inode, .statfs = hostfs_statfs, .show_options = hostfs_show_options, diff --git a/fs/inode.c b/fs/inode.c index 7b81d4a101b8..a66c02123bca 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1838,11 +1838,11 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, EXPORT_SYMBOL(insert_inode_locked4); -int generic_delete_inode(struct inode *inode) +int inode_just_drop(struct inode *inode) { return 1; } -EXPORT_SYMBOL(generic_delete_inode); +EXPORT_SYMBOL(inode_just_drop); /* * Called when we're dropping the last reference @@ -1866,7 +1866,7 @@ static void iput_final(struct inode *inode) if (op->drop_inode) drop = op->drop_inode(inode); else - drop = generic_drop_inode(inode); + drop = inode_generic_drop(inode); if (!drop && !(inode->i_state & I_DONTCACHE) && diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index e384a69fbece..76eaf64b9d9e 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -57,7 +57,7 @@ static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf) const struct super_operations kernfs_sops = { .statfs = kernfs_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = kernfs_evict_inode, .show_options = kernfs_sop_show_options, diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 338ef77ae423..cbd8a7f9c617 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -108,7 +108,7 @@ u64 nfs_compat_user_ino64(u64 fileid) int nfs_drop_inode(struct inode *inode) { - return NFS_STALE(inode) || generic_drop_inode(inode); + return NFS_STALE(inode) || inode_generic_drop(inode); } EXPORT_SYMBOL_GPL(nfs_drop_inode); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 5130ec44e5e1..807e2b758a5c 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -547,7 +547,7 @@ static const struct super_operations dlmfs_ops = { .alloc_inode = dlmfs_alloc_inode, .free_inode = dlmfs_free_inode, .evict_inode = dlmfs_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static const struct inode_operations dlmfs_file_inode_operations = { diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index f3da840758e7..b46100a4f529 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -306,7 +306,7 @@ static const struct super_operations orangefs_s_ops = { .free_inode = orangefs_free_inode, .destroy_inode = orangefs_destroy_inode, .write_inode = orangefs_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .statfs = orangefs_statfs, .show_options = orangefs_show_options, }; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index df85a76597e9..bd3d7ba8fb95 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -280,7 +280,7 @@ static const struct super_operations 
ovl_super_operations = { .alloc_inode = ovl_alloc_inode, .free_inode = ovl_free_inode, .destroy_inode = ovl_destroy_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .put_super = ovl_put_super, .sync_fs = ovl_sync_fs, .statfs = ovl_statfs, diff --git a/fs/pidfs.c b/fs/pidfs.c index edc35522d75c..3865272178a8 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -718,7 +718,7 @@ static void pidfs_evict_inode(struct inode *inode) } static const struct super_operations pidfs_sops = { - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = pidfs_evict_inode, .statfs = simple_statfs, }; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 129490151be1..d9b7ef122343 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -187,7 +187,7 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .free_inode = proc_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = proc_evict_inode, .statfs = simple_statfs, .show_options = proc_show_options, diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 1a2e1185426c..b4e55c90f8dc 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -282,7 +282,7 @@ static int pstore_reconfigure(struct fs_context *fc) static const struct super_operations pstore_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = pstore_evict_inode, .show_options = pstore_show_options, }; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index f8874c3b8c1e..41f9995da7ca 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -215,7 +215,7 @@ static int ramfs_show_options(struct seq_file *m, struct dentry *root) static const struct super_operations ramfs_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .show_options = ramfs_show_options, }; diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 3bd85ab2deb1..b4075385a649 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -857,7 +857,7 @@ static int cifs_drop_inode(struct inode *inode) /* no serverino => unconditional eviction */ return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) || - generic_drop_inode(inode); + inode_generic_drop(inode); } static const struct super_operations cifs_super_ops = { diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f3e3b2068608..733fd1e5a9a2 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -335,7 +335,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) static int ubifs_drop_inode(struct inode *inode) { - int drop = generic_drop_inode(inode); + int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bb0a82635a77..a05ff68748dc 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -778,7 +778,7 @@ xfs_fs_drop_inode( return 0; } - return generic_drop_inode(inode); + return inode_generic_drop(inode); } STATIC void diff --git a/include/linux/fs.h b/include/linux/fs.h index 4daf9b30a641..724b9af67f35 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3312,8 +3312,8 @@ extern void address_space_init_once(struct address_space *mapping); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); extern int inode_needs_sync(struct inode *inode); -extern int generic_delete_inode(struct inode *inode); 
-static inline int generic_drop_inode(struct inode *inode) +extern int inode_just_drop(struct inode *inode); +static inline int inode_generic_drop(struct inode *inode) { return !inode->i_nlink || inode_unhashed(inode); } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5c2e96b19392..6d021d18afa6 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -788,7 +788,7 @@ static void bpf_free_inode(struct inode *inode) const struct super_operations bpf_super_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .show_options = bpf_show_options, .free_inode = bpf_free_inode, }; diff --git a/mm/shmem.c b/mm/shmem.c index e2c76a30802b..932727247c64 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5341,7 +5341,7 @@ static const struct super_operations shmem_ops = { .get_dquots = shmem_get_dquots, #endif .evict_inode = shmem_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .put_super = shmem_put_super, #ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, -- cgit v1.2.3 From 8b0d03129b6165bbf8c9494897489c6da6fadd58 Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Mon, 8 Sep 2025 17:26:46 +0530 Subject: cdx: Export Symbols for MCDI RPC and Initialization The cdx_mcdi_init(), cdx_mcdi_process_cmd(), and cdx_mcdi_rpc() functions are needed by the VersalNET EDAC module to interact with the MCDI (Management Controller Direct Interface) framework. These functions facilitate communication between different hardware components by enabling command execution and status management. Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov (AMD) Acked-by: Nikhil Agarwal Link: https://lore.kernel.org/20250908115649.22903-1-shubhrajyoti.datta@amd.com --- drivers/cdx/controller/mcdi.c | 38 ++++++++++++++++++++++++++++++++++++++ include/linux/cdx/mcdi.h | 7 +++++++ 2 files changed, 45 insertions(+) (limited to 'include/linux') diff --git a/drivers/cdx/controller/mcdi.c b/drivers/cdx/controller/mcdi.c index 90bf9f7c257b..2e82ffc18d89 100644 --- a/drivers/cdx/controller/mcdi.c +++ b/drivers/cdx/controller/mcdi.c @@ -100,6 +100,19 @@ static unsigned long cdx_mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd return cdx->mcdi_ops->mcdi_rpc_timeout(cdx, cmd); } +/** + * cdx_mcdi_init - Initialize MCDI (Management Controller Driver Interface) state + * @cdx: Handle to the CDX MCDI structure + * + * This function allocates and initializes internal MCDI structures and resources + * for the CDX device, including the workqueue, locking primitives, and command + * tracking mechanisms. It sets the initial operating mode and prepares the device + * for MCDI operations. + * + * Return: + * * 0 - on success + * * -ENOMEM - if memory allocation or workqueue creation fails + */ int cdx_mcdi_init(struct cdx_mcdi *cdx) { struct cdx_mcdi_iface *mcdi; @@ -129,7 +142,16 @@ fail2: fail: return rc; } +EXPORT_SYMBOL_GPL(cdx_mcdi_init); +/** + * cdx_mcdi_finish - Cleanup MCDI (Management Controller Driver Interface) state + * @cdx: Handle to the CDX MCDI structure + * + * This function is responsible for cleaning up the MCDI (Management Controller Driver Interface) + * resources associated with a cdx_mcdi structure. Also destroys the mcdi workqueue. 
+ * + */ void cdx_mcdi_finish(struct cdx_mcdi *cdx) { struct cdx_mcdi_iface *mcdi; @@ -144,6 +166,7 @@ void cdx_mcdi_finish(struct cdx_mcdi *cdx) kfree(cdx->mcdi); cdx->mcdi = NULL; } +EXPORT_SYMBOL_GPL(cdx_mcdi_finish); static bool cdx_mcdi_flushed(struct cdx_mcdi_iface *mcdi, bool ignore_cleanups) { @@ -554,6 +577,19 @@ static void cdx_mcdi_start_or_queue(struct cdx_mcdi_iface *mcdi, cdx_mcdi_cmd_start_or_queue(mcdi, cmd); } +/** + * cdx_mcdi_process_cmd - Process an incoming MCDI response + * @cdx: Handle to the CDX MCDI structure + * @outbuf: Pointer to the response buffer received from the management controller + * @len: Length of the response buffer in bytes + * + * This function handles a response from the management controller. It locates the + * corresponding command using the sequence number embedded in the header, + * completes the command if it is still pending, and initiates any necessary cleanup. + * + * The function assumes that the response buffer is well-formed and at least one + * dword in size. + */ void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int len) { struct cdx_mcdi_iface *mcdi; @@ -591,6 +627,7 @@ void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int le cdx_mcdi_process_cleanup_list(mcdi->cdx, &cleanup_list); } +EXPORT_SYMBOL_GPL(cdx_mcdi_process_cmd); static void cdx_mcdi_cmd_work(struct work_struct *context) { @@ -758,6 +795,7 @@ int cdx_mcdi_rpc(struct cdx_mcdi *cdx, unsigned int cmd, return cdx_mcdi_rpc_sync(cdx, cmd, inbuf, inlen, outbuf, outlen, outlen_actual, false); } +EXPORT_SYMBOL_GPL(cdx_mcdi_rpc); /** * cdx_mcdi_rpc_async - Schedule an MCDI command to run asynchronously diff --git a/include/linux/cdx/mcdi.h b/include/linux/cdx/mcdi.h index 46e3f63b062a..74075305cba4 100644 --- a/include/linux/cdx/mcdi.h +++ b/include/linux/cdx/mcdi.h @@ -169,6 +169,13 @@ struct cdx_mcdi_data { u32 fn_flags; }; +void cdx_mcdi_finish(struct cdx_mcdi *cdx); +int cdx_mcdi_init(struct cdx_mcdi *cdx); +void cdx_mcdi_process_cmd(struct cdx_mcdi *cdx, struct cdx_dword *outbuf, int len); +int cdx_mcdi_rpc(struct cdx_mcdi *cdx, unsigned int cmd, + const struct cdx_dword *inbuf, size_t inlen, + struct cdx_dword *outbuf, size_t outlen, size_t *outlen_actual); + /* * We expect that 16- and 32-bit fields in MCDI requests and responses * are appropriately aligned, but 64-bit fields are only -- cgit v1.2.3 From d5fe2fec6c40dda03df8cc9b4a97de0b7e39f984 Mon Sep 17 00:00:00 2001 From: Shubhrajyoti Datta Date: Mon, 8 Sep 2025 17:26:49 +0530 Subject: EDAC: Add a driver for the AMD Versal NET DDR controller Add a driver for the AMD Versal NET DDR memory controller which supports single bit error correction, double bit error detection and other system errors from various IP subsystems (e.g., RPU, NOCs, HNICX, PL). The driver listens for notifications from the NMC (Network management controller) using RPMsg (Remote Processor Messaging). The channel used for communicating to RPMsg is named "error_edac". Upon receipt of a notification, the driver sends a RAS event trace. 
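For orientation before the maintainer's edit notes, here is the notification path described above in miniature, as a hedged sketch: the MCDI_RESPONSE check and the cdx_mcdi_process_cmd() hand-off mirror the driver's rpmsg_cb() in the diff below, while handle_versalnet_error() is a hypothetical stand-in for the error-record decoding and RAS trace emission.

	static int example_rpmsg_cb(struct rpmsg_device *rpdev, void *data,
				    int len, void *priv, u32 src)
	{
		struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev);

		/*
		 * MCDI responses and NMC error notifications share one endpoint;
		 * a leading MCDI_RESPONSE (0xFF) byte marks a reply to an earlier
		 * cdx_mcdi_rpc() and is routed back to the MCDI core.
		 */
		if (*(u8 *)data == MCDI_RESPONSE) {
			cdx_mcdi_process_cmd(mc_priv->mcdi,
					     (struct cdx_dword *)data, len);
			return 0;
		}

		/* Anything else is an error record: decode it, emit a RAS trace. */
		handle_versalnet_error(mc_priv, (u32 *)data, len);
		return 0;
	}
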
[ bp: - Fixup title - Rewrite commit message - Fixup Kconfig text - Zap unused defines and align them - Simplify rpmsg_cb() considerably - Drop silly double-brackets in conditionals - Use proper void * type in mcdi_request() - Do not clear chinfo in rpmsg_probe() unnecessarily - Fix indentation - Do a proper err unwind path in init_versalnet() - Redo the error unwind path in mc_probe() properly - Fix the ordering in mc_remove() ] Signed-off-by: Shubhrajyoti Datta Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250908115649.22903-1-shubhrajyoti.datta@amd.com Link: https://lore.kernel.org/r/20250703173105.GLaGa-WQCESDNsqygm@fat_crate.local --- MAINTAINERS | 7 + drivers/edac/Kconfig | 8 + drivers/edac/Makefile | 1 + drivers/edac/versalnet_edac.c | 958 ++++++++++++++++++++++++++++++++++++++ include/linux/cdx/edac_cdx_pcol.h | 28 ++ 5 files changed, 1002 insertions(+) create mode 100644 drivers/edac/versalnet_edac.c create mode 100644 include/linux/cdx/edac_cdx_pcol.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index ac20587a3e98..23254e89a1f8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -27631,6 +27631,13 @@ S: Maintained F: Documentation/devicetree/bindings/memory-controllers/xlnx,versal-ddrmc-edac.yaml F: drivers/edac/versal_edac.c +XILINX VERSALNET EDAC DRIVER +M: Shubhrajyoti Datta +S: Maintained +F: Documentation/devicetree/bindings/memory-controllers/xlnx,versal-net-ddrmc5.yaml +F: drivers/edac/versalnet_edac.c +F: include/linux/cdx/edac_cdx_pcol.h + XILINX WATCHDOG DRIVER M: Srinivas Neeli R: Shubhrajyoti Datta diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index b824472208c4..39352b9b7a7e 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -584,4 +584,12 @@ config EDAC_CORTEX_A72 The detected and reported errors are from reading CPU/L2 memory error syndrome registers. +config EDAC_VERSALNET + tristate "AMD VersalNET DDR Controller" + depends on CDX_CONTROLLER && ARCH_ZYNQMP + help + Support for single bit error correction, double bit error detection + and other system errors from various IP subsystems like RPU, NOCs, + HNICX, PL on the AMD Versal NET DDR memory controller. + endif # EDAC diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index c1736f264320..1c14796410a3 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -88,4 +88,5 @@ obj-$(CONFIG_EDAC_NPCM) += npcm_edac.o obj-$(CONFIG_EDAC_ZYNQMP) += zynqmp_edac.o obj-$(CONFIG_EDAC_VERSAL) += versal_edac.o obj-$(CONFIG_EDAC_LOONGSON) += loongson_edac.o +obj-$(CONFIG_EDAC_VERSALNET) += versalnet_edac.o obj-$(CONFIG_EDAC_CORTEX_A72) += a72_edac.o diff --git a/drivers/edac/versalnet_edac.c b/drivers/edac/versalnet_edac.c new file mode 100644 index 000000000000..66714fffa591 --- /dev/null +++ b/drivers/edac/versalnet_edac.c @@ -0,0 +1,958 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Versal NET memory controller driver + * Copyright (C) 2025 Advanced Micro Devices, Inc. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "edac_module.h" + +/* Granularity of reported error in bytes */ +#define MC5_ERR_GRAIN 1 +#define MC_GET_DDR_CONFIG_IN_LEN 4 + +#define MC5_IRQ_CE_MASK GENMASK(18, 15) +#define MC5_IRQ_UE_MASK GENMASK(14, 11) + +#define MC5_RANK_1_MASK GENMASK(11, 6) +#define MASK_24 GENMASK(29, 24) +#define MASK_0 GENMASK(5, 0) + +#define MC5_LRANK_1_MASK GENMASK(11, 6) +#define MC5_LRANK_2_MASK GENMASK(17, 12) +#define MC5_BANK1_MASK GENMASK(11, 6) +#define MC5_GRP_0_MASK GENMASK(17, 12) +#define MC5_GRP_1_MASK GENMASK(23, 18) + +#define MC5_REGHI_ROW 7 +#define MC5_EACHBIT 1 +#define MC5_ERR_TYPE_CE 0 +#define MC5_ERR_TYPE_UE 1 +#define MC5_HIGH_MEM_EN BIT(20) +#define MC5_MEM_MASK GENMASK(19, 0) +#define MC5_X16_BASE 256 +#define MC5_X16_ECC 32 +#define MC5_X16_SIZE (MC5_X16_BASE + MC5_X16_ECC) +#define MC5_X32_SIZE 576 +#define MC5_HIMEM_BASE (256 * SZ_1M) +#define MC5_ILC_HIMEM_EN BIT(28) +#define MC5_ILC_MEM GENMASK(27, 0) +#define MC5_INTERLEAVE_SEL GENMASK(3, 0) +#define MC5_BUS_WIDTH_MASK GENMASK(19, 18) +#define MC5_NUM_CHANS_MASK BIT(17) +#define MC5_RANK_MASK GENMASK(15, 14) + +#define ERROR_LEVEL 2 +#define ERROR_ID 3 +#define TOTAL_ERR_LENGTH 5 +#define MSG_ERR_OFFSET 8 +#define MSG_ERR_LENGTH 9 +#define ERROR_DATA 10 +#define MCDI_RESPONSE 0xFF + +#define REG_MAX 152 +#define ADEC_MAX 152 +#define NUM_CONTROLLERS 8 +#define REGS_PER_CONTROLLER 19 +#define ADEC_NUM 19 +#define BUFFER_SZ 80 + +#define XDDR5_BUS_WIDTH_64 0 +#define XDDR5_BUS_WIDTH_32 1 +#define XDDR5_BUS_WIDTH_16 2 + +/** + * union ecc_error_info - ECC error log information. + * @burstpos: Burst position. + * @lrank: Logical Rank number. + * @rank: Rank number. + * @group: Group number. + * @bank: Bank number. + * @col: Column number. + * @row: Row number. + * @rowhi: Row number higher bits. + * @i: Combined ECC error vector containing encoded values of burst position, + * rank, bank, column, and row information. + */ +union ecc_error_info { + struct { + u32 burstpos:3; + u32 lrank:4; + u32 rank:2; + u32 group:3; + u32 bank:2; + u32 col:11; + u32 row:7; + u32 rowhi; + }; + u64 i; +} __packed; + +/* Row and column bit positions in the address decoder (ADEC) registers. */ +union row_col_mapping { + struct { + u32 row0:6; + u32 row1:6; + u32 row2:6; + u32 row3:6; + u32 row4:6; + u32 reserved:2; + }; + struct { + u32 col1:6; + u32 col2:6; + u32 col3:6; + u32 col4:6; + u32 col5:6; + u32 reservedcol:2; + }; + u32 i; +} __packed; + +/** + * struct ecc_status - ECC status information to report. + * @ceinfo: Correctable errors. + * @ueinfo: Uncorrected errors. + * @channel: Channel number. + * @error_type: Error type. + */ +struct ecc_status { + union ecc_error_info ceinfo[2]; + union ecc_error_info ueinfo[2]; + u8 channel; + u8 error_type; +}; + +/** + * struct mc_priv - DDR memory controller private instance data. + * @message: Buffer for framing the event specific info. + * @stat: ECC status information. + * @error_id: The error id. + * @error_level: The error level. + * @dwidth: Width of data bus excluding ECC bits. + * @part_len: Length of the message part(s) received so far. + * @regs: The registers sent on the rpmsg. + * @adec: Address decode registers. + * @mci: Memory controller interface. + * @ept: rpmsg endpoint. + * @mcdi: The mcdi handle. 
+ */ +struct mc_priv { + char message[256]; + struct ecc_status stat; + u32 error_id; + u32 error_level; + u32 dwidth; + u32 part_len; + u32 regs[REG_MAX]; + u32 adec[ADEC_MAX]; + struct mem_ctl_info *mci[NUM_CONTROLLERS]; + struct rpmsg_endpoint *ept; + struct cdx_mcdi *mcdi; +}; + +/* + * Address decoder (ADEC) registers to match the order in which the register + * information is received from the firmware. + */ +enum adec_info { + CONF = 0, + ADEC0, + ADEC1, + ADEC2, + ADEC3, + ADEC4, + ADEC5, + ADEC6, + ADEC7, + ADEC8, + ADEC9, + ADEC10, + ADEC11, + ADEC12, + ADEC13, + ADEC14, + ADEC15, + ADEC16, + ADECILC, +}; + +enum reg_info { + ISR = 0, + IMR, + ECCR0_ERR_STATUS, + ECCR0_ADDR_LO, + ECCR0_ADDR_HI, + ECCR0_DATA_LO, + ECCR0_DATA_HI, + ECCR0_PAR, + ECCR1_ERR_STATUS, + ECCR1_ADDR_LO, + ECCR1_ADDR_HI, + ECCR1_DATA_LO, + ECCR1_DATA_HI, + ECCR1_PAR, + XMPU_ERR, + XMPU_ERR_ADDR_L0, + XMPU_ERR_ADDR_HI, + XMPU_ERR_AXI_ID, + ADEC_CHK_ERR_LOG, +}; + +static bool get_ddr_info(u32 *error_data, struct mc_priv *priv) +{ + u32 reglo, reghi, parity, eccr0_val, eccr1_val, isr; + struct ecc_status *p; + + isr = error_data[ISR]; + + if (!(isr & (MC5_IRQ_UE_MASK | MC5_IRQ_CE_MASK))) + return false; + + eccr0_val = error_data[ECCR0_ERR_STATUS]; + eccr1_val = error_data[ECCR1_ERR_STATUS]; + + if (!eccr0_val && !eccr1_val) + return false; + + p = &priv->stat; + + if (!eccr0_val) + p->channel = 1; + else + p->channel = 0; + + reglo = error_data[ECCR0_ADDR_LO]; + reghi = error_data[ECCR0_ADDR_HI]; + if (isr & MC5_IRQ_CE_MASK) + p->ceinfo[0].i = reglo | (u64)reghi << 32; + else if (isr & MC5_IRQ_UE_MASK) + p->ueinfo[0].i = reglo | (u64)reghi << 32; + + parity = error_data[ECCR0_PAR]; + edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n", + reghi, reglo, parity); + + reglo = error_data[ECCR1_ADDR_LO]; + reghi = error_data[ECCR1_ADDR_HI]; + if (isr & MC5_IRQ_CE_MASK) + p->ceinfo[1].i = reglo | (u64)reghi << 32; + else if (isr & MC5_IRQ_UE_MASK) + p->ueinfo[1].i = reglo | (u64)reghi << 32; + + parity = error_data[ECCR1_PAR]; + edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n", + reghi, reglo, parity); + + return true; +} + +/** + * convert_to_physical - Convert @error_data to a physical address. + * @priv: DDR memory controller private instance data. + * @pinf: ECC error info structure. + * @controller: Controller number of the MC5 + * @error_data: the DDRMC5 ADEC address decoder register data + * + * Return: physical address of the DDR memory. 
+ */ +static unsigned long convert_to_physical(struct mc_priv *priv, + union ecc_error_info pinf, + int controller, int *error_data) +{ + u32 row, blk, rsh_req_addr, interleave, ilc_base_ctrl_add, ilc_himem_en, reg, offset; + u64 high_mem_base, high_mem_offset, low_mem_offset, ilcmem_base; + unsigned long err_addr = 0, addr; + union row_col_mapping cols; + union row_col_mapping rows; + u32 col_bit_0; + + row = pinf.rowhi << MC5_REGHI_ROW | pinf.row; + offset = controller * ADEC_NUM; + + reg = error_data[ADEC6]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + row >>= MC5_EACHBIT; + + reg = error_data[ADEC7]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + row >>= MC5_EACHBIT; + + reg = error_data[ADEC8]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + + reg = error_data[ADEC9]; + rows.i = reg; + + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + + col_bit_0 = FIELD_GET(MASK_24, error_data[ADEC9]); + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << col_bit_0; + + cols.i = error_data[ADEC10]; + err_addr |= (pinf.col & 1) << cols.col1; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col2; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col3; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col4; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col5; + pinf.col >>= 1; + + cols.i = error_data[ADEC11]; + err_addr |= (pinf.col & 1) << cols.col1; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col2; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col3; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col4; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col5; + pinf.col >>= 1; + + reg = error_data[ADEC12]; + err_addr |= (pinf.bank & BIT(0)) << (reg & MASK_0); + pinf.bank >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_BANK1_MASK, reg); + pinf.bank >>= MC5_EACHBIT; + + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_0_MASK, reg); + pinf.group >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_1_MASK, reg); + pinf.group >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MASK_24, reg); + pinf.group >>= MC5_EACHBIT; + + reg = error_data[ADEC4]; + err_addr |= (pinf.rank & BIT(0)) << (reg & MASK_0); + pinf.rank >>= MC5_EACHBIT; + err_addr |= (pinf.rank & BIT(0)) << FIELD_GET(MC5_RANK_1_MASK, reg); + pinf.rank >>= MC5_EACHBIT; + + reg = error_data[ADEC5]; + err_addr |= (pinf.lrank & BIT(0)) << (reg & MASK_0); + pinf.lrank >>= MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_1_MASK, reg); + pinf.lrank >>= 
MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_2_MASK, reg); + pinf.lrank >>= MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MASK_24, reg); + pinf.lrank >>= MC5_EACHBIT; + + high_mem_base = (priv->adec[ADEC2 + offset] & MC5_MEM_MASK) * MC5_HIMEM_BASE; + interleave = priv->adec[ADEC13 + offset] & MC5_INTERLEAVE_SEL; + + high_mem_offset = priv->adec[ADEC3 + offset] & MC5_MEM_MASK; + low_mem_offset = priv->adec[ADEC1 + offset] & MC5_MEM_MASK; + reg = priv->adec[ADEC14 + offset]; + ilc_himem_en = !!(reg & MC5_ILC_HIMEM_EN); + ilcmem_base = (reg & MC5_ILC_MEM) * SZ_1M; + if (ilc_himem_en) + ilc_base_ctrl_add = ilcmem_base - high_mem_offset; + else + ilc_base_ctrl_add = ilcmem_base - low_mem_offset; + + if (priv->dwidth == DEV_X16) { + blk = err_addr / MC5_X16_SIZE; + rsh_req_addr = (blk << 8) + ilc_base_ctrl_add; + err_addr = rsh_req_addr * interleave * 2; + } else { + blk = err_addr / MC5_X32_SIZE; + rsh_req_addr = (blk << 9) + ilc_base_ctrl_add; + err_addr = rsh_req_addr * interleave * 2; + } + + if ((priv->adec[ADEC2 + offset] & MC5_HIGH_MEM_EN) && err_addr >= high_mem_base) + addr = err_addr - high_mem_offset; + else + addr = err_addr - low_mem_offset; + + return addr; +} + +/** + * handle_error - Handle errors. + * @priv: DDR memory controller private instance data. + * @stat: ECC status structure. + * @ctl_num: Controller number of the MC5 + * @error_data: the MC5 ADEC address decoder register data + * + * Handles ECC correctable and uncorrectable errors. + */ +static void handle_error(struct mc_priv *priv, struct ecc_status *stat, + int ctl_num, int *error_data) +{ + union ecc_error_info pinf; + struct mem_ctl_info *mci; + unsigned long pa; + phys_addr_t pfn; + int err; + + if (WARN_ON_ONCE(ctl_num > NUM_CONTROLLERS)) + return; + + mci = priv->mci[ctl_num]; + + if (stat->error_type == MC5_ERR_TYPE_CE) { + pinf = stat->ceinfo[stat->channel]; + snprintf(priv->message, sizeof(priv->message), + "Error type:%s Controller %d Addr at %lx\n", + "CE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data)); + + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + 1, 0, 0, 0, 0, 0, -1, + priv->message, ""); + } + + if (stat->error_type == MC5_ERR_TYPE_UE) { + pinf = stat->ueinfo[stat->channel]; + snprintf(priv->message, sizeof(priv->message), + "Error type:%s controller %d Addr at %lx\n", + "UE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data)); + + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + 1, 0, 0, 0, 0, 0, -1, + priv->message, ""); + pa = convert_to_physical(priv, pinf, ctl_num, error_data); + pfn = PHYS_PFN(pa); + + if (IS_ENABLED(CONFIG_MEMORY_FAILURE)) { + err = memory_failure(pfn, MF_ACTION_REQUIRED); + if (err) + edac_dbg(2, "memory_failure() error: %d", err); + else + edac_dbg(2, "Poison page at PA 0x%lx\n", pa); + } + } +} + +static void mc_init(struct mem_ctl_info *mci, struct device *dev) +{ + struct mc_priv *priv = mci->pvt_info; + struct csrow_info *csi; + struct dimm_info *dimm; + u32 row; + int ch; + + /* Initialize controller capabilities and configuration */ + mci->mtype_cap = MEM_FLAG_DDR5; + mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED; + mci->scrub_cap = SCRUB_HW_SRC; + mci->scrub_mode = SCRUB_NONE; + + mci->edac_cap = EDAC_FLAG_SECDED; + mci->ctl_name = "VersalNET DDR5"; + mci->dev_name = dev_name(dev); + mci->mod_name = "versalnet_edac"; + + edac_op_state = EDAC_OPSTATE_INT; + + for (row = 0; row < mci->nr_csrows; row++) { + csi = mci->csrows[row]; + for (ch = 0; ch < csi->nr_channels; ch++) { + 
dimm = csi->channels[ch]->dimm; + dimm->edac_mode = EDAC_SECDED; + dimm->mtype = MEM_DDR5; + dimm->grain = MC5_ERR_GRAIN; + dimm->dtype = priv->dwidth; + } + } +} + +#define to_mci(k) container_of(k, struct mem_ctl_info, dev) + +static unsigned int mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd) +{ + return MCDI_RPC_TIMEOUT; +} + +static void mcdi_request(struct cdx_mcdi *cdx, + const struct cdx_dword *hdr, size_t hdr_len, + const struct cdx_dword *sdu, size_t sdu_len) +{ + void *send_buf; + int ret; + + send_buf = kzalloc(hdr_len + sdu_len, GFP_KERNEL); + if (!send_buf) + return; + + memcpy(send_buf, hdr, hdr_len); + memcpy(send_buf + hdr_len, sdu, sdu_len); + + ret = rpmsg_send(cdx->ept, send_buf, hdr_len + sdu_len); + if (ret) + dev_err(&cdx->rpdev->dev, "Failed to send rpmsg data: %d\n", ret); + + kfree(send_buf); +} + +static const struct cdx_mcdi_ops mcdi_ops = { + .mcdi_rpc_timeout = mcdi_rpc_timeout, + .mcdi_request = mcdi_request, +}; + +static void get_ddr_config(u32 index, u32 *buffer, struct cdx_mcdi *amd_mcdi) +{ + size_t outlen; + int ret; + + MCDI_DECLARE_BUF(inbuf, MC_GET_DDR_CONFIG_IN_LEN); + MCDI_DECLARE_BUF(outbuf, BUFFER_SZ); + + MCDI_SET_DWORD(inbuf, EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX, index); + + ret = cdx_mcdi_rpc(amd_mcdi, MC_CMD_EDAC_GET_DDR_CONFIG, inbuf, sizeof(inbuf), + outbuf, sizeof(outbuf), &outlen); + if (!ret) + memcpy(buffer, MCDI_PTR(outbuf, GET_DDR_CONFIG), + (ADEC_NUM * 4)); +} + +static int setup_mcdi(struct mc_priv *mc_priv) +{ + struct cdx_mcdi *amd_mcdi; + int ret, i; + + amd_mcdi = kzalloc(sizeof(*amd_mcdi), GFP_KERNEL); + if (!amd_mcdi) + return -ENOMEM; + + amd_mcdi->mcdi_ops = &mcdi_ops; + ret = cdx_mcdi_init(amd_mcdi); + if (ret) { + kfree(amd_mcdi); + return ret; + } + + amd_mcdi->ept = mc_priv->ept; + mc_priv->mcdi = amd_mcdi; + + for (i = 0; i < NUM_CONTROLLERS; i++) + get_ddr_config(i, &mc_priv->adec[ADEC_NUM * i], amd_mcdi); + + return 0; +} + +static const guid_t amd_versalnet_guid = GUID_INIT(0x82678888, 0xa556, 0x44f2, + 0xb8, 0xb4, 0x45, 0x56, 0x2e, + 0x8c, 0x5b, 0xec); + +static int rpmsg_cb(struct rpmsg_device *rpdev, void *data, + int len, void *priv, u32 src) +{ + struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev); + const guid_t *sec_type = &guid_null; + u32 length, offset, error_id; + u32 *result = (u32 *)data; + struct ecc_status *p; + int i, j, k, sec_sev; + const char *err_str; + u32 *adec_data; + + if (*(u8 *)data == MCDI_RESPONSE) { + cdx_mcdi_process_cmd(mc_priv->mcdi, (struct cdx_dword *)data, len); + return 0; + } + + sec_sev = result[ERROR_LEVEL]; + error_id = result[ERROR_ID]; + length = result[MSG_ERR_LENGTH]; + offset = result[MSG_ERR_OFFSET]; + + if (result[TOTAL_ERR_LENGTH] > length) { + if (!mc_priv->part_len) + mc_priv->part_len = length; + else + mc_priv->part_len += length; + /* + * The data can come in 2 stretches. 
Construct the regs from the two + * messages; the offset indicates where the received data is to be + * placed. + */ + for (i = 0 ; i < length; i++) { + k = offset + i; + j = ERROR_DATA + i; + mc_priv->regs[k] = result[j]; + } + if (mc_priv->part_len < result[TOTAL_ERR_LENGTH]) + return 0; + mc_priv->part_len = 0; + } + + mc_priv->error_id = error_id; + mc_priv->error_level = result[ERROR_LEVEL]; + + switch (error_id) { + case 5: err_str = "General Software Non-Correctable error"; break; + case 6: err_str = "CFU error"; break; + case 7: err_str = "CFRAME error"; break; + case 10: err_str = "DDRMC Microblaze Correctable ECC error"; break; + case 11: err_str = "DDRMC Microblaze Non-Correctable ECC error"; break; + case 15: err_str = "MMCM error"; break; + case 16: err_str = "HNICX Correctable error"; break; + case 17: err_str = "HNICX Non-Correctable error"; break; + + case 18: + p = &mc_priv->stat; + memset(p, 0, sizeof(struct ecc_status)); + p->error_type = MC5_ERR_TYPE_CE; + for (i = 0 ; i < NUM_CONTROLLERS; i++) { + if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) { + adec_data = mc_priv->adec + ADEC_NUM * i; + handle_error(mc_priv, &mc_priv->stat, i, adec_data); + } + } + return 0; + case 19: + p = &mc_priv->stat; + memset(p, 0, sizeof(struct ecc_status)); + p->error_type = MC5_ERR_TYPE_UE; + for (i = 0 ; i < NUM_CONTROLLERS; i++) { + if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) { + adec_data = mc_priv->adec + ADEC_NUM * i; + handle_error(mc_priv, &mc_priv->stat, i, adec_data); + } + } + return 0; + + case 21: err_str = "GT Non-Correctable error"; break; + case 22: err_str = "PL Sysmon Correctable error"; break; + case 23: err_str = "PL Sysmon Non-Correctable error"; break; + case 111: err_str = "LPX unexpected dfx activation error"; break; + case 114: err_str = "INT_LPD Non-Correctable error"; break; + case 116: err_str = "INT_OCM Non-Correctable error"; break; + case 117: err_str = "INT_FPD Correctable error"; break; + case 118: err_str = "INT_FPD Non-Correctable error"; break; + case 120: err_str = "INT_IOU Non-Correctable error"; break; + case 123: err_str = "err_int_irq from APU GIC Distributor"; break; + case 124: err_str = "fault_int_irq from APU GIC Distributor"; break; + case 132 ... 139: err_str = "FPX SPLITTER error"; break; + case 140: err_str = "APU Cluster 0 error"; break; + case 141: err_str = "APU Cluster 1 error"; break; + case 142: err_str = "APU Cluster 2 error"; break; + case 143: err_str = "APU Cluster 3 error"; break; + case 145: err_str = "WWDT1 LPX error"; break; + case 147: err_str = "IPI error"; break; + case 152 ... 153: err_str = "AFIFS error"; break; + case 154 ... 155: err_str = "LPX glitch error"; break; + case 185 ... 186: err_str = "FPX AFIFS error"; break; + case 195 ... 199: err_str = "AFIFM error"; break; + case 108: err_str = "PSM Correctable error"; break; + case 59: err_str = "PMC correctable error"; break; + case 60: err_str = "PMC Un-correctable error"; break; + case 43 ... 47: err_str = "PMC Sysmon error"; break; + case 163 ... 
184: err_str = "RPU error"; break; + case 148: err_str = "OCM0 correctable error"; break; + case 149: err_str = "OCM1 correctable error"; break; + case 150: err_str = "OCM0 Un-correctable error"; break; + case 151: err_str = "OCM1 Un-correctable error"; break; + case 189: err_str = "PSX_CMN_3 PD block consolidated error"; break; + case 191: err_str = "FPD_INT_WRAP PD block consolidated error"; break; + case 232: err_str = "CRAM Un-Correctable error"; break; + default: err_str = "VERSAL_EDAC_ERR_ID: %d"; break; + } + + snprintf(mc_priv->message, + sizeof(mc_priv->message), + "[VERSAL_EDAC_ERR_ID: %d] Error type: %s", error_id, err_str); + + /* Convert to bytes */ + length = result[TOTAL_ERR_LENGTH] * 4; + log_non_standard_event(sec_type, &amd_versalnet_guid, mc_priv->message, + sec_sev, (void *)&result[ERROR_DATA], length); + + return 0; +} + +static struct rpmsg_device_id amd_rpmsg_id_table[] = { + { .name = "error_ipc" }, + { }, +}; +MODULE_DEVICE_TABLE(rpmsg, amd_rpmsg_id_table); + +static int rpmsg_probe(struct rpmsg_device *rpdev) +{ + struct rpmsg_channel_info chinfo; + struct mc_priv *pg; + + pg = (struct mc_priv *)amd_rpmsg_id_table[0].driver_data; + chinfo.src = RPMSG_ADDR_ANY; + chinfo.dst = rpdev->dst; + strscpy(chinfo.name, amd_rpmsg_id_table[0].name, + strlen(amd_rpmsg_id_table[0].name)); + + pg->ept = rpmsg_create_ept(rpdev, rpmsg_cb, NULL, chinfo); + if (!pg->ept) + return dev_err_probe(&rpdev->dev, -ENXIO, "Failed to create ept for channel %s\n", + chinfo.name); + + dev_set_drvdata(&rpdev->dev, pg); + + return 0; +} + +static void rpmsg_remove(struct rpmsg_device *rpdev) +{ + struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev); + + rpmsg_destroy_ept(mc_priv->ept); + dev_set_drvdata(&rpdev->dev, NULL); +} + +static struct rpmsg_driver amd_rpmsg_driver = { + .drv.name = KBUILD_MODNAME, + .probe = rpmsg_probe, + .remove = rpmsg_remove, + .callback = rpmsg_cb, + .id_table = amd_rpmsg_id_table, +}; + +static void versal_edac_release(struct device *dev) +{ + kfree(dev); +} + +static int init_versalnet(struct mc_priv *priv, struct platform_device *pdev) +{ + u32 num_chans, rank, dwidth, config; + struct edac_mc_layer layers[2]; + struct mem_ctl_info *mci; + struct device *dev; + enum dev_type dt; + char *name; + int rc, i; + + for (i = 0; i < NUM_CONTROLLERS; i++) { + config = priv->adec[CONF + i * ADEC_NUM]; + num_chans = FIELD_GET(MC5_NUM_CHANS_MASK, config); + rank = 1 << FIELD_GET(MC5_RANK_MASK, config); + dwidth = FIELD_GET(MC5_BUS_WIDTH_MASK, config); + + switch (dwidth) { + case XDDR5_BUS_WIDTH_16: + dt = DEV_X16; + break; + case XDDR5_BUS_WIDTH_32: + dt = DEV_X32; + break; + case XDDR5_BUS_WIDTH_64: + dt = DEV_X64; + break; + default: + dt = DEV_UNKNOWN; + } + + if (dt == DEV_UNKNOWN) + continue; + + /* Find the first enabled device and register that one. 
*/ + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = rank; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = num_chans; + layers[1].is_virt_csrow = false; + + rc = -ENOMEM; + mci = edac_mc_alloc(i, ARRAY_SIZE(layers), layers, + sizeof(struct mc_priv)); + if (!mci) { + edac_printk(KERN_ERR, EDAC_MC, "Failed memory allocation for MC%d\n", i); + goto err_alloc; + } + + priv->mci[i] = mci; + priv->dwidth = dt; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + dev->release = versal_edac_release; + name = kmalloc(32, GFP_KERNEL); + sprintf(name, "versal-net-ddrmc5-edac-%d", i); + dev->init_name = name; + rc = device_register(dev); + if (rc) + goto err_alloc; + + mci->pdev = dev; + + platform_set_drvdata(pdev, priv); + + mc_init(mci, dev); + rc = edac_mc_add_mc(mci); + if (rc) { + edac_printk(KERN_ERR, EDAC_MC, "Failed to register MC%d with EDAC core\n", i); + goto err_alloc; + } + } + return 0; + +err_alloc: + while (i--) { + mci = priv->mci[i]; + if (!mci) + continue; + + if (mci->pdev) { + device_unregister(mci->pdev); + edac_mc_del_mc(mci->pdev); + } + + edac_mc_free(mci); + } + + return rc; +} + +static void remove_versalnet(struct mc_priv *priv) +{ + struct mem_ctl_info *mci; + int i; + + for (i = 0; i < NUM_CONTROLLERS; i++) { + device_unregister(priv->mci[i]->pdev); + mci = edac_mc_del_mc(priv->mci[i]->pdev); + if (!mci) + return; + + edac_mc_free(mci); + } +} + +static int mc_probe(struct platform_device *pdev) +{ + struct device_node *r5_core_node; + struct mc_priv *priv; + struct rproc *rp; + int rc; + + r5_core_node = of_parse_phandle(pdev->dev.of_node, "amd,rproc", 0); + if (!r5_core_node) { + dev_err(&pdev->dev, "amd,rproc: invalid phandle\n"); + return -EINVAL; + } + + rp = rproc_get_by_phandle(r5_core_node->phandle); + if (!rp) + return -EPROBE_DEFER; + + rc = rproc_boot(rp); + if (rc) { + dev_err(&pdev->dev, "Failed to attach to remote processor\n"); + goto err_rproc_boot; + } + + priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + goto err_alloc; + + amd_rpmsg_id_table[0].driver_data = (kernel_ulong_t)priv; + + rc = register_rpmsg_driver(&amd_rpmsg_driver); + if (rc) { + edac_printk(KERN_ERR, EDAC_MC, "Failed to register RPMsg driver: %d\n", rc); + goto err_alloc; + } + + rc = setup_mcdi(priv); + if (rc) + goto err_unreg; + + priv->mcdi->r5_rproc = rp; + + rc = init_versalnet(priv, pdev); + if (rc) + goto err_init; + + return 0; + +err_init: + cdx_mcdi_finish(priv->mcdi); + +err_unreg: + unregister_rpmsg_driver(&amd_rpmsg_driver); + +err_alloc: + rproc_shutdown(rp); + +err_rproc_boot: + rproc_put(rp); + + return rc; +} + +static void mc_remove(struct platform_device *pdev) +{ + struct mc_priv *priv = platform_get_drvdata(pdev); + + unregister_rpmsg_driver(&amd_rpmsg_driver); + remove_versalnet(priv); + rproc_shutdown(priv->mcdi->r5_rproc); + cdx_mcdi_finish(priv->mcdi); +} + +static const struct of_device_id amd_edac_match[] = { + { .compatible = "xlnx,versal-net-ddrmc5", }, + {} +}; +MODULE_DEVICE_TABLE(of, amd_edac_match); + +static struct platform_driver amd_ddr_edac_mc_driver = { + .driver = { + .name = "versal-net-edac", + .of_match_table = amd_edac_match, + }, + .probe = mc_probe, + .remove = mc_remove, +}; + +module_platform_driver(amd_ddr_edac_mc_driver); + +MODULE_AUTHOR("AMD Inc"); +MODULE_DESCRIPTION("Versal NET EDAC driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/cdx/edac_cdx_pcol.h b/include/linux/cdx/edac_cdx_pcol.h new file mode 100644 index 000000000000..749db33bb482 --- 
/dev/null +++ b/include/linux/cdx/edac_cdx_pcol.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Driver for AMD network controllers and boards + * + * Copyright (C) 2021, Xilinx, Inc. + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + */ + +#ifndef MC_CDX_PCOL_H +#define MC_CDX_PCOL_H +#include + +#define MC_CMD_EDAC_GET_DDR_CONFIG_OUT_WORD_LENGTH_LEN 4 +/* Number of registers for the DDR controller */ +#define MC_CMD_GET_DDR_CONFIG_OFST 4 +#define MC_CMD_GET_DDR_CONFIG_LEN 4 + +/***********************************/ +/* MC_CMD_EDAC_GET_DDR_CONFIG + * Provides detailed configuration for the DDR controller of the given index. + */ +#define MC_CMD_EDAC_GET_DDR_CONFIG 0x3 + +/* MC_CMD_EDAC_GET_DDR_CONFIG_IN msgrequest */ +#define MC_CMD_EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX_OFST 0 +#define MC_CMD_EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX_LEN 4 + +#endif /* MC_CDX_PCOL_H */ -- cgit v1.2.3 From 1b3aa3900782707ec2f4cc1651bc82c628f25d2b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Sep 2025 17:45:36 -0600 Subject: io_uring/uring_cmd: correct signature for io_uring_mshot_cmd_post_cqe() The !CONFIG_IO_URING signature is wrong, fix that up. The non-stub signature got updated for the io_br_sel changes that happened before this patch went in, but the stub one did not. Fixes: 620a50c92700 ("io_uring: uring_cmd: add multishot support") Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 1350af846ddd..c8185f54fde9 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -126,7 +126,7 @@ io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, unsigned buf_group, return (struct io_br_sel) { .val = -EOPNOTSUPP }; } static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, - ssize_t ret, unsigned int issue_flags) + struct io_br_sel *sel, unsigned int issue_flags) { return true; } -- cgit v1.2.3 From 0cbaf65c91db0e40a577e8919979dac1963cfcc0 Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:43 -0700 Subject: tee: add close_context to TEE driver operation The tee_context can be used to manage TEE user resources, including those allocated by the driver for the TEE on behalf of the user. The release() callback is invoked only when all resources, such as tee_shm, are released and there are no references to the tee_context. When a user closes the device file, the driver should notify the TEE to release any resources it may hold and drop the context references. To achieve this, a close_context() callback is introduced to initiate resource release in the TEE driver when the device file is closed. Relocate teedev_ctx_get, teedev_ctx_put, tee_device_get, and tee_device_put functions to tee_core.h to make them accessible outside the TEE subsystem. 
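To make the new hook concrete, a minimal sketch of a driver wiring it up; the my_*() callbacks and my_notify_tee_of_close() are hypothetical, only the tee_driver_ops members come from the patch. close_context() fires as soon as the device file is closed, while release() still runs only once the last reference to the context is dropped.

	static void my_close_context(struct tee_context *ctx)
	{
		/* Device file closed: ask the TEE to start releasing
		 * per-context resources even though references remain.
		 */
		my_notify_tee_of_close(ctx);
	}

	static const struct tee_driver_ops my_tee_ops = {
		.get_version   = my_get_version,
		.open          = my_open,
		.close_context = my_close_context,	/* new optional hook */
		.release       = my_release,		/* final teardown */
	};
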
Reviewed-by: Sumit Garg Tested-by: Neil Armstrong Tested-by: Harshal Dev Signed-off-by: Amirreza Zarrabi Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 7 +++++++ drivers/tee/tee_private.h | 6 ------ include/linux/tee_core.h | 50 +++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 55 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 68f556898836..5a7fce5b6007 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -80,6 +80,7 @@ void teedev_ctx_get(struct tee_context *ctx) kref_get(&ctx->refcount); } +EXPORT_SYMBOL_GPL(teedev_ctx_get); static void teedev_ctx_release(struct kref *ref) { @@ -97,11 +98,15 @@ void teedev_ctx_put(struct tee_context *ctx) kref_put(&ctx->refcount, teedev_ctx_release); } +EXPORT_SYMBOL_GPL(teedev_ctx_put); void teedev_close_context(struct tee_context *ctx) { struct tee_device *teedev = ctx->teedev; + if (teedev->desc->ops->close_context) + teedev->desc->ops->close_context(ctx); + teedev_ctx_put(ctx); tee_device_put(teedev); } @@ -1112,6 +1117,7 @@ void tee_device_put(struct tee_device *teedev) } mutex_unlock(&teedev->mutex); } +EXPORT_SYMBOL_GPL(tee_device_put); bool tee_device_get(struct tee_device *teedev) { @@ -1124,6 +1130,7 @@ bool tee_device_get(struct tee_device *teedev) mutex_unlock(&teedev->mutex); return true; } +EXPORT_SYMBOL_GPL(tee_device_get); /** * tee_device_unregister() - Removes a TEE device diff --git a/drivers/tee/tee_private.h b/drivers/tee/tee_private.h index a9b5e4a6a8f7..6bde688bfcb1 100644 --- a/drivers/tee/tee_private.h +++ b/drivers/tee/tee_private.h @@ -23,12 +23,6 @@ struct tee_shm_dmabuf_ref { int tee_shm_get_fd(struct tee_shm *shm); -bool tee_device_get(struct tee_device *teedev); -void tee_device_put(struct tee_device *teedev); - -void teedev_ctx_get(struct tee_context *ctx); -void teedev_ctx_put(struct tee_context *ctx); - struct tee_shm *tee_shm_alloc_user_buf(struct tee_context *ctx, size_t size); struct tee_shm *tee_shm_register_user_buf(struct tee_context *ctx, unsigned long addr, size_t length); diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h index 7b0c1da2ca6c..456a940d4710 100644 --- a/include/linux/tee_core.h +++ b/include/linux/tee_core.h @@ -76,8 +76,9 @@ struct tee_device { /** * struct tee_driver_ops - driver operations vtable * @get_version: returns version of driver - * @open: called when the device file is opened - * @release: release this open file + * @open: called for a context when the device file is opened + * @close_context: called when the device file is closed + * @release: called to release the context * @open_session: open a new session * @close_session: close a session * @system_session: declare session as a system session @@ -87,11 +88,17 @@ struct tee_device { * @supp_send: called for supplicant to send a response * @shm_register: register shared memory buffer in TEE * @shm_unregister: unregister shared memory buffer in TEE + * + * The context given to @open might last longer than the device file if it is + * tied to other resources in the TEE driver. @close_context is called when the + * client closes the device file, even if there are existing references to the + * context. The TEE driver can use @close_context to start cleaning up. 
*/ struct tee_driver_ops { void (*get_version)(struct tee_device *teedev, struct tee_ioctl_version_data *vers); int (*open)(struct tee_context *ctx); + void (*close_context)(struct tee_context *ctx); void (*release)(struct tee_context *ctx); int (*open_session)(struct tee_context *ctx, struct tee_ioctl_open_session_arg *arg, @@ -200,6 +207,24 @@ int tee_device_register_dma_heap(struct tee_device *teedev, struct tee_protmem_pool *pool); void tee_device_put_all_dma_heaps(struct tee_device *teedev); +/** + * tee_device_get() - Increment the user count for a tee_device + * @teedev: Pointer to the tee_device + * + * If tee_device_unregister() has been called and the final user of @teedev + * has already released the device, this function will fail, to prevent new users + * from accessing the device during the unregistration process. + * + * Returns: true if @teedev remains valid, otherwise false + */ +bool tee_device_get(struct tee_device *teedev); + +/** + * tee_device_put() - Decrease the user count for a tee_device + * @teedev: pointer to the tee_device + */ +void tee_device_put(struct tee_device *teedev); + /** * tee_device_set_dev_groups() - Set device attribute groups * @teedev: Device to register @@ -374,4 +399,25 @@ struct tee_context *teedev_open(struct tee_device *teedev); */ void teedev_close_context(struct tee_context *ctx); +/** + * teedev_ctx_get() - Increment the reference count of a context + * @ctx: Pointer to the context + * + * This function increases the refcount of the context, which is tied to + * resources shared by the same tee_device. During the unregistration process, + * the context may remain valid even after tee_device_unregister() has returned. + * + * Users should ensure that the context's refcount is properly decreased before + * calling tee_device_put(), typically within the context's release() function. + * Alternatively, users can call tee_device_get() and teedev_ctx_get() together + * and release them simultaneously (see shm_alloc_helper()). + */ +void teedev_ctx_get(struct tee_context *ctx); + +/** + * teedev_ctx_put() - Decrease reference count on a context + * @ctx: pointer to the context + */ +void teedev_ctx_put(struct tee_context *ctx); + #endif /*__TEE_CORE_H*/ -- cgit v1.2.3 From 54a53e95a908a4cc770f0530c49f04c89e7b18dc Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:44 -0700 Subject: tee: add TEE_IOCTL_PARAM_ATTR_TYPE_UBUF For drivers that can transfer data to the TEE without using shared memory from the client, it is necessary to receive the user address directly, bypassing any processing by the TEE subsystem. Introduce TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT/OUTPUT/INOUT to represent userspace buffers. 
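A hedged sketch of the driver side, assuming a driver that bounces the user data into kernel memory itself; my_consume_ubuf() and the bounce buffer are illustrative, while tee_param, u.ubuf, and the attribute constants come from the patch below (access_ok() has already been checked in params_from_user()).

	static int my_consume_ubuf(struct tee_param *p)
	{
		void *bounce;
		int rc = 0;

		if ((p->attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK) !=
		    TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT)
			return -EINVAL;

		bounce = kzalloc(p->u.ubuf.size, GFP_KERNEL);
		if (!bounce)
			return -ENOMEM;

		if (copy_from_user(bounce, p->u.ubuf.uaddr, p->u.ubuf.size))
			rc = -EFAULT;

		/* ... hand the bounce buffer to the TEE transport ... */
		kfree(bounce);
		return rc;
	}
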
Reviewed-by: Sumit Garg Tested-by: Neil Armstrong Tested-by: Harshal Dev Signed-off-by: Amirreza Zarrabi Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 33 +++++++++++++++++++++++++++++++++ include/linux/tee_drv.h | 6 ++++++ include/uapi/linux/tee.h | 22 ++++++++++++++++------ 3 files changed, 55 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 5a7fce5b6007..529738565ebd 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -494,6 +494,17 @@ static int params_from_user(struct tee_context *ctx, struct tee_param *params, params[n].u.value.b = ip.b; params[n].u.value.c = ip.c; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT: + params[n].u.ubuf.uaddr = u64_to_user_ptr(ip.a); + params[n].u.ubuf.size = ip.b; + + if (!access_ok(params[n].u.ubuf.uaddr, + params[n].u.ubuf.size)) + return -EFAULT; + + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: @@ -527,6 +538,11 @@ static int params_to_user(struct tee_ioctl_param __user *uparams, put_user(p->u.value.c, &up->c)) return -EFAULT; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT: + if (put_user((u64)p->u.ubuf.size, &up->b)) + return -EFAULT; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: if (put_user((u64)p->u.memref.size, &up->b)) @@ -727,6 +743,13 @@ static int params_to_supp(struct tee_context *ctx, ip.b = p->u.value.b; ip.c = p->u.value.c; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT: + ip.a = (__force unsigned long)p->u.ubuf.uaddr; + ip.b = p->u.ubuf.size; + ip.c = 0; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: @@ -829,6 +852,16 @@ static int params_from_supp(struct tee_param *params, size_t num_params, p->u.value.b = ip.b; p->u.value.c = ip.c; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT: + p->u.ubuf.uaddr = u64_to_user_ptr(ip.a); + p->u.ubuf.size = ip.b; + + if (!access_ok(params[n].u.ubuf.uaddr, + params[n].u.ubuf.size)) + return -EFAULT; + + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: /* diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 824f1251de60..7915e8869cbd 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -82,6 +82,11 @@ struct tee_param_memref { struct tee_shm *shm; }; +struct tee_param_ubuf { + void __user *uaddr; + size_t size; +}; + struct tee_param_value { u64 a; u64 b; @@ -92,6 +97,7 @@ struct tee_param { u64 attr; union { struct tee_param_memref memref; + struct tee_param_ubuf ubuf; struct tee_param_value value; } u; }; diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index d843cf980d98..0e3b735dcfca 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -151,6 +151,13 @@ struct tee_ioctl_buf_data { #define TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT 6 #define TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT 7 /* input and output */ +/* + * These defines userspace buffer parameters. 
+ */ +#define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT 8 +#define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT 9 +#define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT 10 /* input and output */ + /* * Mask for the type part of the attribute, leaves room for more types */ @@ -186,14 +193,17 @@ struct tee_ioctl_buf_data { /** * struct tee_ioctl_param - parameter * @attr: attributes - * @a: if a memref, offset into the shared memory object, else a value parameter - * @b: if a memref, size of the buffer, else a value parameter + * @a: if a memref, offset into the shared memory object, + * else if a ubuf, address of the user buffer, + * else a value parameter + * @b: if a memref or ubuf, size of the buffer, else a value parameter * @c: if a memref, shared memory identifier, else a value parameter * - * @attr & TEE_PARAM_ATTR_TYPE_MASK indicates if memref or value is used in - * the union. TEE_PARAM_ATTR_TYPE_VALUE_* indicates value and - * TEE_PARAM_ATTR_TYPE_MEMREF_* indicates memref. TEE_PARAM_ATTR_TYPE_NONE - * indicates that none of the members are used. + * @attr & TEE_PARAM_ATTR_TYPE_MASK indicates if memref, ubuf, or value is + * used in the union. TEE_PARAM_ATTR_TYPE_VALUE_* indicates value, + * TEE_PARAM_ATTR_TYPE_MEMREF_* indicates memref, and TEE_PARAM_ATTR_TYPE_UBUF_* + * indicates ubuf. TEE_PARAM_ATTR_TYPE_NONE indicates that none of the members + * are used. * * Shared memory is allocated with TEE_IOC_SHM_ALLOC which returns an * identifier representing the shared memory object. A memref can reference -- cgit v1.2.3 From d5b8b0fa1775d8b59c3fc9e4aa2baa715d08f3ee Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:45 -0700 Subject: tee: add TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF The TEE subsystem allows session-based access to trusted services, requiring a session to be established to receive a service. This is not suitable for an environment that represents services as objects. An object supports various operations that a client can invoke, potentially generating a result or a new object that can be invoked independently of the original object. Add TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT/OUTPUT/INOUT to represent an object. Objects may reside in either TEE or userspace. To invoke an object in TEE, introduce a new ioctl. Use the existing SUPPL_RECV and SUPPL_SEND to invoke an object in userspace. 
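As an illustration only (not part of this patch; object_invoke is an invented helper name), userspace could drive the new ioctl roughly like this:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/tee.h>

/* Invoke operation 'op' on TEE object 'id' with no parameters. */
static int object_invoke(int fd, uint64_t id, uint32_t op)
{
	struct tee_ioctl_object_invoke_arg arg;
	struct tee_ioctl_buf_data buf;

	memset(&arg, 0, sizeof(arg));
	arg.id = id;
	arg.op = op;
	arg.num_params = 0;

	buf.buf_ptr = (uintptr_t)&arg;
	buf.buf_len = sizeof(arg);	/* sizeof(arg) plus the (empty) param area */

	if (ioctl(fd, TEE_IOC_OBJECT_INVOKE, &buf) < 0)
		return -1;
	return arg.ret;			/* object-specific return code */
}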
Reviewed-by: Sumit Garg Tested-by: Neil Armstrong Tested-by: Harshal Dev Signed-off-by: Amirreza Zarrabi Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/tee_core.h | 4 +++ include/linux/tee_drv.h | 6 ++++ include/uapi/linux/tee.h | 41 +++++++++++++++++++---- 4 files changed, 130 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 529738565ebd..71b1bd69f067 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -487,6 +487,7 @@ static int params_from_user(struct tee_context *ctx, struct tee_param *params, switch (ip.attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK) { case TEE_IOCTL_PARAM_ATTR_TYPE_NONE: case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT: break; case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT: @@ -505,6 +506,11 @@ static int params_from_user(struct tee_context *ctx, struct tee_param *params, return -EFAULT; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT: + params[n].u.objref.id = ip.a; + params[n].u.objref.flags = ip.b; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: @@ -543,6 +549,12 @@ static int params_to_user(struct tee_ioctl_param __user *uparams, if (put_user((u64)p->u.ubuf.size, &up->b)) return -EFAULT; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT: + if (put_user(p->u.objref.id, &up->a) || + put_user(p->u.objref.flags, &up->b)) + return -EFAULT; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: if (put_user((u64)p->u.memref.size, &up->b)) @@ -695,6 +707,66 @@ out: return rc; } +static int tee_ioctl_object_invoke(struct tee_context *ctx, + struct tee_ioctl_buf_data __user *ubuf) +{ + int rc; + size_t n; + struct tee_ioctl_buf_data buf; + struct tee_ioctl_object_invoke_arg __user *uarg; + struct tee_ioctl_object_invoke_arg arg; + struct tee_ioctl_param __user *uparams = NULL; + struct tee_param *params = NULL; + + if (!ctx->teedev->desc->ops->object_invoke_func) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, sizeof(buf))) + return -EFAULT; + + if (buf.buf_len > TEE_MAX_ARG_SIZE || + buf.buf_len < sizeof(struct tee_ioctl_object_invoke_arg)) + return -EINVAL; + + uarg = u64_to_user_ptr(buf.buf_ptr); + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (sizeof(arg) + TEE_IOCTL_PARAM_SIZE(arg.num_params) != buf.buf_len) + return -EINVAL; + + if (arg.num_params) { + params = kcalloc(arg.num_params, sizeof(struct tee_param), + GFP_KERNEL); + if (!params) + return -ENOMEM; + uparams = uarg->params; + rc = params_from_user(ctx, params, arg.num_params, uparams); + if (rc) + goto out; + } + + rc = ctx->teedev->desc->ops->object_invoke_func(ctx, &arg, params); + if (rc) + goto out; + + if (put_user(arg.ret, &uarg->ret)) { + rc = -EFAULT; + goto out; + } + rc = params_to_user(uparams, arg.num_params, params); +out: + if (params) { + /* Decrease ref count for all valid shared memory pointers */ + for (n = 0; n < arg.num_params; n++) + if (tee_param_is_memref(params + n) && + params[n].u.memref.shm) + tee_shm_put(params[n].u.memref.shm); + kfree(params); + } + return rc; +} + static int tee_ioctl_cancel(struct tee_context *ctx, struct tee_ioctl_cancel_arg __user *uarg) { @@ 
-750,6 +822,12 @@ static int params_to_supp(struct tee_context *ctx, ip.b = p->u.ubuf.size; ip.c = 0; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT: + ip.a = p->u.objref.id; + ip.b = p->u.objref.flags; + ip.c = 0; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: @@ -862,6 +940,11 @@ static int params_from_supp(struct tee_param *params, size_t num_params, return -EFAULT; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT: + p->u.objref.id = ip.a; + p->u.objref.flags = ip.b; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: /* @@ -944,6 +1027,8 @@ static long tee_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return tee_ioctl_open_session(ctx, uarg); case TEE_IOC_INVOKE: return tee_ioctl_invoke(ctx, uarg); + case TEE_IOC_OBJECT_INVOKE: + return tee_ioctl_object_invoke(ctx, uarg); case TEE_IOC_CANCEL: return tee_ioctl_cancel(ctx, uarg); case TEE_IOC_CLOSE_SESSION: diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h index 456a940d4710..1f3e5dad6d0d 100644 --- a/include/linux/tee_core.h +++ b/include/linux/tee_core.h @@ -83,6 +83,7 @@ struct tee_device { * @close_session: close a session * @system_session: declare session as a system session * @invoke_func: invoke a trusted function + * @object_invoke_func: invoke a TEE object * @cancel_req: request cancel of an ongoing invoke or open * @supp_recv: called for supplicant to get a command * @supp_send: called for supplicant to send a response @@ -108,6 +109,9 @@ struct tee_driver_ops { int (*invoke_func)(struct tee_context *ctx, struct tee_ioctl_invoke_arg *arg, struct tee_param *param); + int (*object_invoke_func)(struct tee_context *ctx, + struct tee_ioctl_object_invoke_arg *arg, + struct tee_param *param); int (*cancel_req)(struct tee_context *ctx, u32 cancel_id, u32 session); int (*supp_recv)(struct tee_context *ctx, u32 *func, u32 *num_params, struct tee_param *param); diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 7915e8869cbd..88a6f9697c89 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -87,6 +87,11 @@ struct tee_param_ubuf { size_t size; }; +struct tee_param_objref { + u64 id; + u64 flags; +}; + struct tee_param_value { u64 a; u64 b; @@ -97,6 +102,7 @@ struct tee_param { u64 attr; union { struct tee_param_memref memref; + struct tee_param_objref objref; struct tee_param_ubuf ubuf; struct tee_param_value value; } u; diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index 0e3b735dcfca..9abb0f299549 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -48,8 +48,10 @@ #define TEE_GEN_CAP_PRIVILEGED (1 << 1)/* Privileged device (for supplicant) */ #define TEE_GEN_CAP_REG_MEM (1 << 2)/* Supports registering shared memory */ #define TEE_GEN_CAP_MEMREF_NULL (1 << 3)/* NULL MemRef support */ +#define TEE_GEN_CAP_OBJREF (1 << 4)/* Supports generic object reference */ -#define TEE_MEMREF_NULL (__u64)(-1) /* NULL MemRef Buffer */ +#define TEE_MEMREF_NULL ((__u64)(-1)) /* NULL MemRef Buffer */ +#define TEE_OBJREF_NULL ((__u64)(-1)) /* NULL ObjRef Object */ /* * TEE Implementation ID @@ -158,6 +160,13 @@ struct tee_ioctl_buf_data { #define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT 9 #define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT 10 /* input and output */ +/* + * These defines object reference parameters. 
+ */ +#define TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT 11 +#define TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT 12 +#define TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT 13 + /* * Mask for the type part of the attribute, leaves room for more types */ @@ -195,15 +204,16 @@ struct tee_ioctl_buf_data { * @attr: attributes * @a: if a memref, offset into the shared memory object, * else if a ubuf, address of the user buffer, - * else a value parameter - * @b: if a memref or ubuf, size of the buffer, else a value parameter + * else if an objref, object identifier, else a value parameter + * @b: if a memref or ubuf, size of the buffer, + * else if objref, flags for the object, else a value parameter * @c: if a memref, shared memory identifier, else a value parameter * * @attr & TEE_PARAM_ATTR_TYPE_MASK indicates if memref, ubuf, or value is * used in the union. TEE_PARAM_ATTR_TYPE_VALUE_* indicates value, - * TEE_PARAM_ATTR_TYPE_MEMREF_* indicates memref, and TEE_PARAM_ATTR_TYPE_UBUF_* - * indicates ubuf. TEE_PARAM_ATTR_TYPE_NONE indicates that none of the members - * are used. + * TEE_PARAM_ATTR_TYPE_MEMREF_* indicates memref, TEE_PARAM_ATTR_TYPE_UBUF_* + * indicates ubuf, and TEE_PARAM_ATTR_TYPE_OBJREF_* indicates objref. + * TEE_PARAM_ATTR_TYPE_NONE indicates that none of the members are used. * * Shared memory is allocated with TEE_IOC_SHM_ALLOC which returns an * identifier representing the shared memory object. A memref can reference @@ -442,4 +452,23 @@ struct tee_ioctl_shm_register_fd_data { * munmap(): unmaps previously shared memory */ +/** + * struct tee_ioctl_invoke_func_arg - Invokes an object in a Trusted Application + * @id: [in] Object id + * @op: [in] Object operation, specific to the object + * @ret: [out] return value + * @num_params: [in] number of parameters following this struct + */ +struct tee_ioctl_object_invoke_arg { + __u64 id; + __u32 op; + __u32 ret; + __u32 num_params; + /* num_params tells the actual number of element in params */ + struct tee_ioctl_param params[]; +}; + +#define TEE_IOC_OBJECT_INVOKE _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 10, \ + struct tee_ioctl_buf_data) + #endif /*__TEE_H*/ -- cgit v1.2.3 From 2c895133950646f45e5cf3900b168c952c8dbee8 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 15 Sep 2025 03:26:17 +0000 Subject: bpf: Do not limit bpf_cgroup_from_id to current's namespace The bpf_cgroup_from_id kfunc relies on cgroup_get_from_id to obtain the cgroup corresponding to a given cgroup ID. This helper can be called in a lot of contexts where the current thread can be random. A recent example was its use in sched_ext's ops.tick(), to obtain the root cgroup pointer. Since the current task can be whatever random user space task preempted by the timer tick, this makes the behavior of the helper unreliable. Refactor out __cgroup_get_from_id as the non-namespace aware version of cgroup_get_from_id, and change bpf_cgroup_from_id to make use of it. There is no compatibility breakage here, since changing the namespace against which the lookup is being done to the root cgroup namespace only permits a wider set of lookups to succeed now. The cgroup IDs across namespaces are globally unique, and thus don't need to be retranslated. 
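For illustration (a sketch assuming a BPF program built against vmlinux.h with kfunc ksym declarations; touch_cgroup is an invented name), the acquire/release pair can now be used from arbitrary contexts:

struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
void bpf_cgroup_release(struct cgroup *cgrp) __ksym;

static int touch_cgroup(u64 cgid)
{
	struct cgroup *cgrp = bpf_cgroup_from_id(cgid);

	if (!cgrp)	/* NULL on lookup failure, not ERR_PTR */
		return -1;
	/* ... inspect cgrp here ... */
	bpf_cgroup_release(cgrp);	/* drop the acquired reference */
	return 0;
}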
Reported-by: Dan Schatzberg Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20250915032618.1551762-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/cgroup.h | 1 + kernel/bpf/helpers.c | 2 +- kernel/cgroup/cgroup.c | 24 ++++++++++++++++++++---- 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e..b08c8e62881c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -650,6 +650,7 @@ static inline void cgroup_kthread_ready(void) } void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); +struct cgroup *__cgroup_get_from_id(u64 id); struct cgroup *cgroup_get_from_id(u64 id); #else /* !CONFIG_CGROUPS */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c0c0764a2025..51229aba5318 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2546,7 +2546,7 @@ __bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid) { struct cgroup *cgrp; - cgrp = cgroup_get_from_id(cgid); + cgrp = __cgroup_get_from_id(cgid); if (IS_ERR(cgrp)) return NULL; return cgrp; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 312c6a8b55bb..6c2d20ac697c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -6343,15 +6343,15 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) } /* - * cgroup_get_from_id : get the cgroup associated with cgroup id + * __cgroup_get_from_id : get the cgroup associated with cgroup id * @id: cgroup id * On success return the cgrp or ERR_PTR on failure - * Only cgroups within current task's cgroup NS are valid. + * There are no cgroup NS restrictions. */ -struct cgroup *cgroup_get_from_id(u64 id) +struct cgroup *__cgroup_get_from_id(u64 id) { struct kernfs_node *kn; - struct cgroup *cgrp, *root_cgrp; + struct cgroup *cgrp; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) @@ -6373,6 +6373,22 @@ struct cgroup *cgroup_get_from_id(u64 id) if (!cgrp) return ERR_PTR(-ENOENT); + return cgrp; +} + +/* + * cgroup_get_from_id : get the cgroup associated with cgroup id + * @id: cgroup id + * On success return the cgrp or ERR_PTR on failure + * Only cgroups within current task's cgroup NS are valid. + */ +struct cgroup *cgroup_get_from_id(u64 id) +{ + struct cgroup *cgrp, *root_cgrp; + + cgrp = __cgroup_get_from_id(id); + if (IS_ERR(cgrp)) + return cgrp; root_cgrp = current_cgns_cgroup_dfl(); if (!cgroup_is_descendant(cgrp, root_cgrp)) { -- cgit v1.2.3 From c3426ba2ed6942fe33c75bf17fc7513ba2c6ac64 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Thu, 11 Sep 2025 13:06:31 +0200 Subject: tcp: reorganize tcp_sock_write_txrx group for variables later Use the first 3-byte hole at the beginning of the tcp_sock_write_txrx group for 'nonagle'/'rate_app_limited' to fill in the existing hole in later patches. Therefore, the group size of tcp_sock_write_txrx is reduced from 92 + 4 to 91 + 4. In addition, the group size of tcp_sock_write_rx is changed to 96 to match the pahole output. Below are the trimmed pahole outputs before and after this patch: [BEFORE THIS PATCH] struct tcp_sock { [...] __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ /* XXX 3 bytes hole, try to pack */ [...]
struct tcp_options_received rx_opt; /* 2588 24 */ u8 nonagle:4; /* 2612: 0 1 */ u8 rate_app_limited:1; /* 2612: 4 1 */ /* XXX 3 bits hole, try to pack */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2613 0 */ /* XXX 3 bytes hole, try to pack */ __cacheline_group_begin__tcp_sock_write_rx[0] __attribute__((__aligned__(8))); /* 2616 0 */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2712 0 */ [...] /* size: 3200, cachelines: 50, members: 161 */ } [AFTER THIS PATCH] struct tcp_sock { [...] __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* XXX 2 bytes hole, try to pack */ [...] struct tcp_options_received rx_opt; /* 2588 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2612 0 */ /* XXX 4 bytes hole, try to pack */ __cacheline_group_begin__tcp_sock_write_rx[0] __attribute__((__aligned__(8))); /* 2616 0 */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2712 0 */ [...] /* size: 3200, cachelines: 50, members: 161 */ } Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250911110642.87529-4-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 4 ++-- net/ipv4/tcp.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 57e478bfaef2..d103cc0e7a35 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -285,6 +285,8 @@ struct tcp_sock { * Header prediction flags * 0x5?10 << 16 + snd_wnd in net byte order */ + u8 nonagle : 4,/* Disable Nagle algorithm? */ + rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ @@ -303,8 +305,6 @@ struct tcp_sock { * Options received (usually on last packet, some only on SYN packets). */ struct tcp_options_received rx_opt; - u8 nonagle : 4,/* Disable Nagle algorithm? */ - rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ __cacheline_group_end(tcp_sock_write_txrx); /* RX read-write hotpath cache lines */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7f9c671b1ee0..1f643faa8b93 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5145,7 +5145,7 @@ static void __init tcp_struct_check(void) /* 32bit arches with 8byte alignment on u64 fields might need padding * before tcp_clock_cache. 
*/ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 92 + 4); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 91 + 4); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); @@ -5162,7 +5162,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 99); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 96); } void __init tcp_init(void) -- cgit v1.2.3 From 07c446e35b89bc8774792f8036e595cffdf5b162 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 16 Sep 2025 08:47:43 +0900 Subject: firewire: core: maintain phy packet receivers locally in cdev layer The list of receivers for phy packet is used only by cdev layer, while it is maintained as a member of fw_card structure. This commit maintains the list locally in cdev layer. Link: https://lore.kernel.org/r/20250915234747.915922-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 1 - drivers/firewire/core-cdev.c | 27 ++++++++++++++++++++------- include/linux/firewire.h | 2 -- 3 files changed, 20 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index b5e01a711145..616abb836ef3 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -556,7 +556,6 @@ void fw_card_initialize(struct fw_card *card, kref_init(&card->kref); init_completion(&card->done); INIT_LIST_HEAD(&card->transaction_list); - INIT_LIST_HEAD(&card->phy_receiver_list); spin_lock_init(&card->lock); card->local_node = NULL; diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 1be8f5eb3e17..112b33099610 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -47,6 +47,9 @@ #define FW_CDEV_VERSION_AUTO_FLUSH_ISO_OVERFLOW 5 #define FW_CDEV_VERSION_EVENT_ASYNC_TSTAMP 6 +static DEFINE_SPINLOCK(phy_receiver_list_lock); +static LIST_HEAD(phy_receiver_list); + struct client { u32 version; struct fw_device *device; @@ -1669,15 +1672,16 @@ static int ioctl_send_phy_packet(struct client *client, union ioctl_arg *arg) static int ioctl_receive_phy_packets(struct client *client, union ioctl_arg *arg) { struct fw_cdev_receive_phy_packets *a = &arg->receive_phy_packets; - struct fw_card *card = client->device->card; /* Access policy: Allow this ioctl only on local nodes' device files. */ if (!client->device->is_local) return -ENOSYS; - guard(spinlock_irq)(&card->lock); + // NOTE: This can be without irq when we can guarantee that __fw_send_request() for local + // destination never runs in any type of IRQ context. + scoped_guard(spinlock_irq, &phy_receiver_list_lock) + list_move_tail(&client->phy_receiver_link, &phy_receiver_list); - list_move_tail(&client->phy_receiver_link, &card->phy_receiver_list); client->phy_receiver_closure = a->closure; return 0; @@ -1687,10 +1691,17 @@ void fw_cdev_handle_phy_packet(struct fw_card *card, struct fw_packet *p) { struct client *client; - guard(spinlock_irqsave)(&card->lock); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for local + // destination never runs in any type of IRQ context. 
+ guard(spinlock_irqsave)(&phy_receiver_list_lock); + + list_for_each_entry(client, &phy_receiver_list, phy_receiver_link) { + struct inbound_phy_packet_event *e; + + if (client->device->card != card) + continue; - list_for_each_entry(client, &card->phy_receiver_list, phy_receiver_link) { - struct inbound_phy_packet_event *e = kmalloc(sizeof(*e) + 8, GFP_ATOMIC); + e = kmalloc(sizeof(*e) + 8, GFP_ATOMIC); if (e == NULL) break; @@ -1857,7 +1868,9 @@ static int fw_device_op_release(struct inode *inode, struct file *file) struct client_resource *resource; unsigned long index; - scoped_guard(spinlock_irq, &client->device->card->lock) + // NOTE: This can be without irq when we can guarantee that __fw_send_request() for local + // destination never runs in any type of IRQ context. + scoped_guard(spinlock_irq, &phy_receiver_list_lock) list_del(&client->phy_receiver_link); scoped_guard(mutex, &client->device->client_list_mutex) diff --git a/include/linux/firewire.h b/include/linux/firewire.h index d38c6e538e5c..f3260aacf730 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -115,8 +115,6 @@ struct fw_card { int index; struct list_head link; - struct list_head phy_receiver_list; - struct delayed_work br_work; /* bus reset job */ bool br_short; -- cgit v1.2.3 From 7d138cb269dbd2fa9b0da89a9c10503d1cf269d5 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 16 Sep 2025 08:47:44 +0900 Subject: firewire: core: use spin lock specific to topology map At present, read transactions to the topology map register are not protected by any lock primitive, so a reader can observe a partially updated (mixed) topology map. This commit adds and uses a spin lock specific to the topology map. Link: https://lore.kernel.org/r/20250915234747.915922-4-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-topology.c | 22 ++++++++++++++-------- drivers/firewire/core-transaction.c | 6 +++++- include/linux/firewire.h | 6 +++++- 3 files changed, 24 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 17aaf14cab0b..c62cf93f3f65 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -435,20 +435,22 @@ static void update_tree(struct fw_card *card, struct fw_node *root) } } -static void update_topology_map(struct fw_card *card, - u32 *self_ids, int self_id_count) +static void update_topology_map(__be32 *buffer, size_t buffer_size, int root_node_id, + const u32 *self_ids, int self_id_count) { - int node_count = (card->root_node->node_id & 0x3f) + 1; - __be32 *map = card->topology_map; + __be32 *map = buffer; + int node_count = (root_node_id & 0x3f) + 1; + + memset(map, 0, buffer_size); *map++ = cpu_to_be32((self_id_count + 2) << 16); - *map++ = cpu_to_be32(be32_to_cpu(card->topology_map[1]) + 1); + *map++ = cpu_to_be32(be32_to_cpu(buffer[1]) + 1); *map++ = cpu_to_be32((node_count << 16) | self_id_count); while (self_id_count--) *map++ = cpu_to_be32p(self_ids++); - fw_compute_block_crc(card->topology_map); + fw_compute_block_crc(buffer); } void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, @@ -479,8 +481,6 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, local_node = build_tree(card, self_ids, self_id_count, generation); - update_topology_map(card, self_ids, self_id_count); - card->color++; if (local_node == NULL) { @@ -493,5 +493,11 @@ void
fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation, update_tree(card, local_node); } } + + // Just used by transaction layer. + scoped_guard(spinlock, &card->topology_map.lock) { + update_topology_map(card->topology_map.buffer, sizeof(card->topology_map.buffer), + card->root_node->node_id, self_ids, self_id_count); + } } EXPORT_SYMBOL(fw_core_handle_bus_reset); diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 623e1d9bd107..8edffafd21c1 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -1196,7 +1196,11 @@ static void handle_topology_map(struct fw_card *card, struct fw_request *request } start = (offset - topology_map_region.start) / 4; - memcpy(payload, &card->topology_map[start], length); + + // NOTE: This can be without irqsave when we can guarantee that fw_send_request() for local + // destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->topology_map.lock) + memcpy(payload, &card->topology_map.buffer[start], length); fw_send_response(card, request, RCODE_COMPLETE); } diff --git a/include/linux/firewire.h b/include/linux/firewire.h index f3260aacf730..aeb71c39e57e 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -129,7 +129,11 @@ struct fw_card { bool broadcast_channel_allocated; u32 broadcast_channel; - __be32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4]; + + struct { + __be32 buffer[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4]; + spinlock_t lock; + } topology_map; __be32 maint_utility_register; -- cgit v1.2.3 From 420bd7068cbfaea0a857472dd631dc48311e2a8f Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 16 Sep 2025 08:47:45 +0900 Subject: firewire: core: use spin lock specific to transaction The list of asynchronous transactions waiting for their response subactions is maintained as a member of the fw_card structure. The card-wide spinlock is currently used for every operation on the list, although it is not well suited to the purpose. This commit adds and uses a spin lock specific to maintaining the list.
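As a sketch of the resulting pattern (illustrative only; example_alloc_tlabel is an invented wrapper around the existing allocate_tlabel()), transaction bookkeeping now contends only on its own lock:

static int example_alloc_tlabel(struct fw_card *card)
{
	/* card->lock is untouched; topology updates can run in parallel. */
	guard(spinlock_irqsave)(&card->transactions.lock);
	return allocate_tlabel(card);
}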
Link: https://lore.kernel.org/r/20250915234747.915922-5-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 12 +++++--- drivers/firewire/core-transaction.c | 59 ++++++++++++++++++++++--------------- include/linux/firewire.h | 10 +++++-- 3 files changed, 51 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 616abb836ef3..39f95007305f 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -544,8 +544,12 @@ void fw_card_initialize(struct fw_card *card, card->index = atomic_inc_return(&index); card->driver = driver; card->device = device; - card->current_tlabel = 0; - card->tlabel_mask = 0; + + card->transactions.current_tlabel = 0; + card->transactions.tlabel_mask = 0; + INIT_LIST_HEAD(&card->transactions.list); + spin_lock_init(&card->transactions.lock); + card->split_timeout_hi = DEFAULT_SPLIT_TIMEOUT / 8000; card->split_timeout_lo = (DEFAULT_SPLIT_TIMEOUT % 8000) << 19; card->split_timeout_cycles = DEFAULT_SPLIT_TIMEOUT; @@ -555,7 +559,7 @@ void fw_card_initialize(struct fw_card *card, kref_init(&card->kref); init_completion(&card->done); - INIT_LIST_HEAD(&card->transaction_list); + spin_lock_init(&card->lock); card->local_node = NULL; @@ -772,7 +776,7 @@ void fw_core_remove_card(struct fw_card *card) destroy_workqueue(card->isoc_wq); destroy_workqueue(card->async_wq); - WARN_ON(!list_empty(&card->transaction_list)); + WARN_ON(!list_empty(&card->transactions.list)); } EXPORT_SYMBOL(fw_core_remove_card); diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 8edffafd21c1..5366d8a781ac 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -49,12 +49,14 @@ static int close_transaction(struct fw_transaction *transaction, struct fw_card { struct fw_transaction *t = NULL, *iter; - scoped_guard(spinlock_irqsave, &card->lock) { - list_for_each_entry(iter, &card->transaction_list, link) { + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. 
+ scoped_guard(spinlock_irqsave, &card->transactions.lock) { + list_for_each_entry(iter, &card->transactions.list, link) { if (iter == transaction) { if (try_cancel_split_timeout(iter)) { list_del_init(&iter->link); - card->tlabel_mask &= ~(1ULL << iter->tlabel); + card->transactions.tlabel_mask &= ~(1ULL << iter->tlabel); t = iter; } break; @@ -117,11 +119,11 @@ static void split_transaction_timeout_callback(struct timer_list *timer) struct fw_transaction *t = timer_container_of(t, timer, split_timeout_timer); struct fw_card *card = t->card; - scoped_guard(spinlock_irqsave, &card->lock) { + scoped_guard(spinlock_irqsave, &card->transactions.lock) { if (list_empty(&t->link)) return; list_del(&t->link); - card->tlabel_mask &= ~(1ULL << t->tlabel); + card->transactions.tlabel_mask &= ~(1ULL << t->tlabel); } if (!t->with_tstamp) { @@ -259,18 +261,21 @@ static void fw_fill_request(struct fw_packet *packet, int tcode, int tlabel, } static int allocate_tlabel(struct fw_card *card) +__must_hold(&card->transactions_lock) { int tlabel; - tlabel = card->current_tlabel; - while (card->tlabel_mask & (1ULL << tlabel)) { + lockdep_assert_held(&card->transactions.lock); + + tlabel = card->transactions.current_tlabel; + while (card->transactions.tlabel_mask & (1ULL << tlabel)) { tlabel = (tlabel + 1) & 0x3f; - if (tlabel == card->current_tlabel) + if (tlabel == card->transactions.current_tlabel) return -EBUSY; } - card->current_tlabel = (tlabel + 1) & 0x3f; - card->tlabel_mask |= 1ULL << tlabel; + card->transactions.current_tlabel = (tlabel + 1) & 0x3f; + card->transactions.tlabel_mask |= 1ULL << tlabel; return tlabel; } @@ -331,7 +336,6 @@ void __fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode void *payload, size_t length, union fw_transaction_callback callback, bool with_tstamp, void *callback_data) { - unsigned long flags; int tlabel; /* @@ -339,11 +343,11 @@ void __fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode * the list while holding the card spinlock. */ - spin_lock_irqsave(&card->lock, flags); - - tlabel = allocate_tlabel(card); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->transactions.lock) + tlabel = allocate_tlabel(card); if (tlabel < 0) { - spin_unlock_irqrestore(&card->lock, flags); if (!with_tstamp) { callback.without_tstamp(card, RCODE_SEND_ERROR, NULL, 0, callback_data); } else { @@ -368,15 +372,22 @@ void __fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode t->callback = callback; t->with_tstamp = with_tstamp; t->callback_data = callback_data; - - fw_fill_request(&t->packet, tcode, t->tlabel, destination_id, card->node_id, generation, - speed, offset, payload, length); t->packet.callback = transmit_complete_callback; - list_add_tail(&t->link, &card->transaction_list); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->lock) { + // The node_id field of fw_card can be updated when handling SelfIDComplete. + fw_fill_request(&t->packet, tcode, t->tlabel, destination_id, card->node_id, + generation, speed, offset, payload, length); + } - spin_unlock_irqrestore(&card->lock, flags); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. 
+ scoped_guard(spinlock_irqsave, &card->transactions.lock) + list_add_tail(&t->link, &card->transactions.list); + // Safe with no lock, since the index field of fw_card is immutable once assigned. trace_async_request_outbound_initiate((uintptr_t)t, card->index, generation, speed, t->packet.header, payload, tcode_is_read_request(tcode) ? 0 : length / 4); @@ -1111,12 +1122,14 @@ void fw_core_handle_response(struct fw_card *card, struct fw_packet *p) break; } - scoped_guard(spinlock_irqsave, &card->lock) { - list_for_each_entry(iter, &card->transaction_list, link) { + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->transactions.lock) { + list_for_each_entry(iter, &card->transactions.list, link) { if (iter->node_id == source && iter->tlabel == tlabel) { if (try_cancel_split_timeout(iter)) { list_del_init(&iter->link); - card->tlabel_mask &= ~(1ULL << iter->tlabel); + card->transactions.tlabel_mask &= ~(1ULL << iter->tlabel); t = iter; } break; diff --git a/include/linux/firewire.h b/include/linux/firewire.h index aeb71c39e57e..8d6801cf2fca 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -88,11 +88,15 @@ struct fw_card { int node_id; int generation; - int current_tlabel; - u64 tlabel_mask; - struct list_head transaction_list; u64 reset_jiffies; + struct { + int current_tlabel; + u64 tlabel_mask; + struct list_head list; + spinlock_t lock; + } transactions; + u32 split_timeout_hi; u32 split_timeout_lo; unsigned int split_timeout_cycles; -- cgit v1.2.3 From b5725cfa4120a4d234ab112aad151d731531d093 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Tue, 16 Sep 2025 08:47:46 +0900 Subject: firewire: core: use spin lock specific to timer for split transaction At present the parameters used to compute the timeout for split transactions are protected by the card-wide spin lock, which is inconvenient when trying to narrow critical sections. This commit adds and uses another spin lock specific to this purpose.
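As a sketch (illustrative only; example_write_split_timeout_hi is an invented wrapper mirroring the CSR_SPLIT_TIMEOUT_HI path below), a register write now takes only the dedicated lock:

static void example_write_split_timeout_hi(struct fw_card *card, u32 val)
{
	guard(spinlock_irqsave)(&card->split_timeout.lock);
	card->split_timeout.hi = val & 7;	/* 3-bit seconds field */
	update_split_timeout(card);		/* recompute .cycles and .jiffies */
}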
Link: https://lore.kernel.org/r/20250915234747.915922-6-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- drivers/firewire/core-card.c | 10 +++--- drivers/firewire/core-transaction.c | 63 ++++++++++++++++++++++++------------- include/linux/firewire.h | 15 +++++---- 3 files changed, 57 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 39f95007305f..96495392a1f6 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -550,10 +550,12 @@ void fw_card_initialize(struct fw_card *card, INIT_LIST_HEAD(&card->transactions.list); spin_lock_init(&card->transactions.lock); - card->split_timeout_hi = DEFAULT_SPLIT_TIMEOUT / 8000; - card->split_timeout_lo = (DEFAULT_SPLIT_TIMEOUT % 8000) << 19; - card->split_timeout_cycles = DEFAULT_SPLIT_TIMEOUT; - card->split_timeout_jiffies = isoc_cycles_to_jiffies(DEFAULT_SPLIT_TIMEOUT); + card->split_timeout.hi = DEFAULT_SPLIT_TIMEOUT / 8000; + card->split_timeout.lo = (DEFAULT_SPLIT_TIMEOUT % 8000) << 19; + card->split_timeout.cycles = DEFAULT_SPLIT_TIMEOUT; + card->split_timeout.jiffies = isoc_cycles_to_jiffies(DEFAULT_SPLIT_TIMEOUT); + spin_lock_init(&card->split_timeout.lock); + card->color = 0; card->broadcast_channel = BROADCAST_CHANNEL_INITIAL; diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c index 5366d8a781ac..dd3656a0c1ff 100644 --- a/drivers/firewire/core-transaction.c +++ b/drivers/firewire/core-transaction.c @@ -137,14 +137,18 @@ static void split_transaction_timeout_callback(struct timer_list *timer) static void start_split_transaction_timeout(struct fw_transaction *t, struct fw_card *card) { - guard(spinlock_irqsave)(&card->lock); + unsigned long delta; if (list_empty(&t->link) || WARN_ON(t->is_split_transaction)) return; t->is_split_transaction = true; - mod_timer(&t->split_timeout_timer, - jiffies + card->split_timeout_jiffies); + + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->split_timeout.lock) + delta = card->split_timeout.jiffies; + mod_timer(&t->split_timeout_timer, jiffies + delta); } static u32 compute_split_timeout_timestamp(struct fw_card *card, u32 request_timestamp); @@ -164,8 +168,12 @@ static void transmit_complete_callback(struct fw_packet *packet, break; case ACK_PENDING: { - t->split_timeout_cycle = - compute_split_timeout_timestamp(card, packet->timestamp) & 0xffff; + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. 
+ scoped_guard(spinlock_irqsave, &card->split_timeout.lock) { + t->split_timeout_cycle = + compute_split_timeout_timestamp(card, packet->timestamp) & 0xffff; + } start_split_transaction_timeout(t, card); break; } @@ -790,11 +798,14 @@ EXPORT_SYMBOL(fw_fill_response); static u32 compute_split_timeout_timestamp(struct fw_card *card, u32 request_timestamp) +__must_hold(&card->split_timeout.lock) { unsigned int cycles; u32 timestamp; - cycles = card->split_timeout_cycles; + lockdep_assert_held(&card->split_timeout.lock); + + cycles = card->split_timeout.cycles; cycles += request_timestamp & 0x1fff; timestamp = request_timestamp & ~0x1fff; @@ -845,9 +856,12 @@ static struct fw_request *allocate_request(struct fw_card *card, return NULL; kref_init(&request->kref); + // NOTE: This can be without irqsave when we can guarantee that __fw_send_request() for + // local destination never runs in any type of IRQ context. + scoped_guard(spinlock_irqsave, &card->split_timeout.lock) + request->response.timestamp = compute_split_timeout_timestamp(card, p->timestamp); + request->response.speed = p->speed; - request->response.timestamp = - compute_split_timeout_timestamp(card, p->timestamp); request->response.generation = p->generation; request->response.ack = 0; request->response.callback = free_response_callback; @@ -1228,16 +1242,17 @@ static const struct fw_address_region registers_region = .end = CSR_REGISTER_BASE | CSR_CONFIG_ROM, }; static void update_split_timeout(struct fw_card *card) +__must_hold(&card->split_timeout.lock) { unsigned int cycles; - cycles = card->split_timeout_hi * 8000 + (card->split_timeout_lo >> 19); + cycles = card->split_timeout.hi * 8000 + (card->split_timeout.lo >> 19); /* minimum per IEEE 1394, maximum which doesn't overflow OHCI */ cycles = clamp(cycles, 800u, 3u * 8000u); - card->split_timeout_cycles = cycles; - card->split_timeout_jiffies = isoc_cycles_to_jiffies(cycles); + card->split_timeout.cycles = cycles; + card->split_timeout.jiffies = isoc_cycles_to_jiffies(cycles); } static void handle_registers(struct fw_card *card, struct fw_request *request, @@ -1287,12 +1302,15 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, case CSR_SPLIT_TIMEOUT_HI: if (tcode == TCODE_READ_QUADLET_REQUEST) { - *data = cpu_to_be32(card->split_timeout_hi); + *data = cpu_to_be32(card->split_timeout.hi); } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { - guard(spinlock_irqsave)(&card->lock); - - card->split_timeout_hi = be32_to_cpu(*data) & 7; - update_split_timeout(card); + // NOTE: This can be without irqsave when we can guarantee that + // __fw_send_request() for local destination never runs in any type of IRQ + // context. + scoped_guard(spinlock_irqsave, &card->split_timeout.lock) { + card->split_timeout.hi = be32_to_cpu(*data) & 7; + update_split_timeout(card); + } } else { rcode = RCODE_TYPE_ERROR; } @@ -1300,12 +1318,15 @@ static void handle_registers(struct fw_card *card, struct fw_request *request, case CSR_SPLIT_TIMEOUT_LO: if (tcode == TCODE_READ_QUADLET_REQUEST) { - *data = cpu_to_be32(card->split_timeout_lo); + *data = cpu_to_be32(card->split_timeout.lo); } else if (tcode == TCODE_WRITE_QUADLET_REQUEST) { - guard(spinlock_irqsave)(&card->lock); - - card->split_timeout_lo = be32_to_cpu(*data) & 0xfff80000; - update_split_timeout(card); + // NOTE: This can be without irqsave when we can guarantee that + // __fw_send_request() for local destination never runs in any type of IRQ + // context. 
+ scoped_guard(spinlock_irqsave, &card->split_timeout.lock) { + card->split_timeout.lo = be32_to_cpu(*data) & 0xfff80000; + update_split_timeout(card); + } } else { rcode = RCODE_TYPE_ERROR; } diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 8d6801cf2fca..6d208769d456 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -97,18 +97,21 @@ struct fw_card { spinlock_t lock; } transactions; - u32 split_timeout_hi; - u32 split_timeout_lo; - unsigned int split_timeout_cycles; - unsigned int split_timeout_jiffies; + struct { + u32 hi; + u32 lo; + unsigned int cycles; + unsigned int jiffies; + spinlock_t lock; + } split_timeout; unsigned long long guid; unsigned max_receive; int link_speed; int config_rom_generation; - spinlock_t lock; /* Take this lock when handling the lists in - * this struct. */ + spinlock_t lock; + struct fw_node *local_node; struct fw_node *root_node; struct fw_node *irm_node; -- cgit v1.2.3 From f9fadf23c7f1a0df72ef50a873e1bd3bd4631ec1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 4 Feb 2024 21:25:18 -0500 Subject: security_dentry_init_security(): constify qstr argument Nothing outside of fs/dcache.c has any business modifying dentry names; passing &dentry->d_name as an argument should have that argument declared as a const pointer. Acked-by: Casey Schaufler # smack part Acked-by: Paul Moore Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/lsm_hook_defs.h | 2 +- include/linux/security.h | 4 ++-- security/security.c | 2 +- security/selinux/hooks.c | 2 +- security/smack/smack_lsm.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index fd11fffdd3c3..aa4d6ec9c98b 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -85,7 +85,7 @@ LSM_HOOK(int, -EOPNOTSUPP, dentry_init_security, struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, struct lsm_context *cp) LSM_HOOK(int, 0, dentry_create_files_as, struct dentry *dentry, int mode, - struct qstr *name, const struct cred *old, struct cred *new) + const struct qstr *name, const struct cred *old, struct cred *new) #ifdef CONFIG_SECURITY_PATH LSM_HOOK(int, 0, path_unlink, const struct path *dir, struct dentry *dentry) diff --git a/include/linux/security.h b/include/linux/security.h index 521bcb5b9717..3f694d3ebd70 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -391,7 +391,7 @@ int security_dentry_init_security(struct dentry *dentry, int mode, const char **xattr_name, struct lsm_context *lsmcxt); int security_dentry_create_files_as(struct dentry *dentry, int mode, - struct qstr *name, + const struct qstr *name, const struct cred *old, struct cred *new); int security_path_notify(const struct path *path, u64 mask, @@ -871,7 +871,7 @@ static inline int security_dentry_init_security(struct dentry *dentry, } static inline int security_dentry_create_files_as(struct dentry *dentry, - int mode, struct qstr *name, + int mode, const struct qstr *name, const struct cred *old, struct cred *new) { diff --git a/security/security.c b/security/security.c index ad163f06bf7a..db2d75be87cc 100644 --- a/security/security.c +++ b/security/security.c @@ -1775,7 +1775,7 @@ EXPORT_SYMBOL(security_dentry_init_security); * Return: Returns 0 on success, error on failure. 
*/ int security_dentry_create_files_as(struct dentry *dentry, int mode, - struct qstr *name, + const struct qstr *name, const struct cred *old, struct cred *new) { return call_int_hook(dentry_create_files_as, dentry, mode, diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d..58ce49954206 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2901,7 +2901,7 @@ static int selinux_dentry_init_security(struct dentry *dentry, int mode, } static int selinux_dentry_create_files_as(struct dentry *dentry, int mode, - struct qstr *name, + const struct qstr *name, const struct cred *old, struct cred *new) { diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index fc340a6f0dde..5caa372ffbf3 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -4908,7 +4908,7 @@ static int smack_inode_copy_up_xattr(struct dentry *src, const char *name) } static int smack_dentry_create_files_as(struct dentry *dentry, int mode, - struct qstr *name, + const struct qstr *name, const struct cred *old, struct cred *new) { -- cgit v1.2.3 From ca97d6c60b1d1dff519a7e3dd86708304e657365 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 11 Jul 2025 05:45:01 -0400 Subject: generic_ci_validate_strict_name(): constify name argument Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/fs.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..6dcfc1c399ca 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3719,7 +3719,8 @@ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, * happens when a directory is casefolded and the filesystem is strict * about its encoding. */ -static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name) +static inline bool generic_ci_validate_strict_name(struct inode *dir, + const struct qstr *name) { if (!IS_CASEFOLDED(dir) || !sb_has_strict_encoding(dir->i_sb)) return true; @@ -3734,7 +3735,8 @@ static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qst return !utf8_validate(dir->i_sb->s_encoding, name); } #else -static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name) +static inline bool generic_ci_validate_strict_name(struct inode *dir, + const struct qstr *name) { return true; } -- cgit v1.2.3 From 180a9cc3fd6a020746fbd7f97b9b62295a325fd2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 9 Feb 2024 14:57:43 -0500 Subject: make it easier to catch those who try to modify ->d_name Turn d_name into an anon union of const struct qstr d_name with struct qstr __d_name. Very few places need to modify it (all in fs/dcache.c); those are switched to use of ->__d_name. Note that ->d_name can actually change under you unless you have the right locking environment; this const just prohibits accidentally doing stores without being easily spotted. 
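The aliasing works for any field that only one translation unit may legitimately write; a minimal standalone sketch (struct names invented, qstr simplified):

struct qstr { unsigned int len; const char *name; };

struct dentry_like {
	union {
		struct qstr __d_name;		/* writable alias, owner code only */
		const struct qstr d_name;	/* read-only view for everyone else */
	};
};

Stores through d_name fail to compile, while the in-memory layout is unchanged.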
Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/dcache.c | 26 +++++++++++++------------- include/linux/dcache.h | 5 ++++- 2 files changed, 17 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 60046ae23d51..b4cd5e1321b3 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1717,13 +1717,13 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dname = dentry->d_shortname.string; } - dentry->d_name.len = name->len; - dentry->d_name.hash = name->hash; + dentry->__d_name.len = name->len; + dentry->__d_name.hash = name->hash; memcpy(dname, name->name, name->len); dname[name->len] = 0; /* Make sure we always see the terminating NUL character */ - smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ + smp_store_release(&dentry->__d_name.name, dname); /* ^^^ */ dentry->d_flags = 0; lockref_init(&dentry->d_lockref); @@ -2743,15 +2743,15 @@ static void swap_names(struct dentry *dentry, struct dentry *target) /* * Both external: swap the pointers */ - swap(target->d_name.name, dentry->d_name.name); + swap(target->__d_name.name, dentry->__d_name.name); } else { /* * dentry:internal, target:external. Steal target's * storage and make target internal. */ - dentry->d_name.name = target->d_name.name; + dentry->__d_name.name = target->__d_name.name; target->d_shortname = dentry->d_shortname; - target->d_name.name = target->d_shortname.string; + target->__d_name.name = target->d_shortname.string; } } else { if (unlikely(dname_external(dentry))) { @@ -2759,9 +2759,9 @@ static void swap_names(struct dentry *dentry, struct dentry *target) * dentry:external, target:internal. Give dentry's * storage to target and make dentry internal */ - target->d_name.name = dentry->d_name.name; + target->__d_name.name = dentry->__d_name.name; dentry->d_shortname = target->d_shortname; - dentry->d_name.name = dentry->d_shortname.string; + dentry->__d_name.name = dentry->d_shortname.string; } else { /* * Both are internal. 
@@ -2771,7 +2771,7 @@ static void swap_names(struct dentry *dentry, struct dentry *target) target->d_shortname.words[i]); } } - swap(dentry->d_name.hash_len, target->d_name.hash_len); + swap(dentry->__d_name.hash_len, target->__d_name.hash_len); } static void copy_name(struct dentry *dentry, struct dentry *target) @@ -2781,11 +2781,11 @@ static void copy_name(struct dentry *dentry, struct dentry *target) old_name = external_name(dentry); if (unlikely(dname_external(target))) { atomic_inc(&external_name(target)->count); - dentry->d_name = target->d_name; + dentry->__d_name = target->__d_name; } else { dentry->d_shortname = target->d_shortname; - dentry->d_name.name = dentry->d_shortname.string; - dentry->d_name.hash_len = target->d_name.hash_len; + dentry->__d_name.name = dentry->d_shortname.string; + dentry->__d_name.hash_len = target->__d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->count))) kfree_rcu(old_name, head); @@ -3133,7 +3133,7 @@ void d_mark_tmpfile(struct file *file, struct inode *inode) !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - dentry->d_name.len = sprintf(dentry->d_shortname.string, "#%llu", + dentry->__d_name.len = sprintf(dentry->d_shortname.string, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index cc3e1c1a3454..c83e02b94389 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -95,7 +95,10 @@ struct dentry { seqcount_spinlock_t d_seq; /* per dentry seqlock */ struct hlist_bl_node d_hash; /* lookup hash list */ struct dentry *d_parent; /* parent directory */ - struct qstr d_name; + union { + struct qstr __d_name; /* for use ONLY in fs/dcache.c */ + const struct qstr d_name; + }; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ union shortname_store d_shortname; -- cgit v1.2.3 From dae575e669811b201114702d96f6854d5c8324b5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Sep 2025 23:16:35 -0400 Subject: backing_file_user_path(): constify struct path * Callers never use the resulting pointer to modify the struct path it points to (nor should they). 
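For illustration (example_hold_user_path is an invented caller), existing consumers keep working because the path helpers already take const pointers:

static void example_hold_user_path(const struct file *f)
{
	const struct path *p = backing_file_user_path(f);

	path_get(p);	/* path_get()/path_put() accept const struct path * */
	/* ... use p ... */
	path_put(p);
}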
Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/file_table.c | 2 +- include/linux/fs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/file_table.c b/fs/file_table.c index 81c72576e548..85b53e39138d 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -54,7 +54,7 @@ struct backing_file { #define backing_file(f) container_of(f, struct backing_file, file) -struct path *backing_file_user_path(const struct file *f) +const struct path *backing_file_user_path(const struct file *f) { return &backing_file(f)->user_path; } diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..3bcc878817be 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2879,7 +2879,7 @@ struct file *dentry_open_nonotify(const struct path *path, int flags, const struct cred *cred); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); -struct path *backing_file_user_path(const struct file *f); +const struct path *backing_file_user_path(const struct file *f); /* * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file -- cgit v1.2.3 From 63dbfb077cdad21b356e17d4ce76650e67b83159 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Jul 2025 21:58:05 -0400 Subject: done_path_create(): constify path argument Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namei.c | 2 +- include/linux/namei.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 869976213b0c..3eb0408e3400 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4170,7 +4170,7 @@ struct dentry *kern_path_create(int dfd, const char *pathname, } EXPORT_SYMBOL(kern_path_create); -void done_path_create(struct path *path, struct dentry *dentry) +void done_path_create(const struct path *path, struct dentry *dentry) { if (!IS_ERR(dentry)) dput(dentry); diff --git a/include/linux/namei.h b/include/linux/namei.h index 5d085428e471..75c0b665fbd4 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -60,7 +60,7 @@ extern int kern_path(const char *, unsigned, struct path *); extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int); extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); -extern void done_path_create(struct path *, struct dentry *); +extern void done_path_create(const struct path *, struct dentry *); extern struct dentry *kern_path_locked(const char *, struct path *); extern struct dentry *kern_path_locked_negative(const char *, struct path *); extern struct dentry *user_path_locked_at(int , const char __user *, struct path *); -- cgit v1.2.3 From 2930afe2c9cb9aec329269e40c851bf56cdcc09c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Jul 2025 21:32:41 -0400 Subject: export_operations->open(): constify path argument for the method and its sole instance... 
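An instance of the constified method would now look like this sketch (example_export_open is an invented name):

static struct file *example_export_open(const struct path *path,
					unsigned int oflags)
{
	/* dentry_open() already takes a const struct path *. */
	return dentry_open(path, oflags, current_cred());
}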
Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/pidfs.c | 2 +- include/linux/exportfs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/pidfs.c b/fs/pidfs.c index 108e7527f837..5af4fee288ea 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -847,7 +847,7 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx, return 0; } -static struct file *pidfs_export_open(struct path *path, unsigned int oflags) +static struct file *pidfs_export_open(const struct path *path, unsigned int oflags) { /* * Clear O_LARGEFILE as open_by_handle_at() forces it and raise diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index cfb0dd1ea49c..f43c83e0b8c5 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -270,7 +270,7 @@ struct export_operations { int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); int (*permission)(struct handle_to_path_ctx *ctx, unsigned int oflags); - struct file * (*open)(struct path *path, unsigned int oflags); + struct file * (*open)(const struct path *path, unsigned int oflags); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ -- cgit v1.2.3 From 1f6df5847454dee8608f78ee0df7352472cb2447 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 6 Jul 2025 18:45:02 -0400 Subject: drop_collected_paths(): constify arguments ... and use that to constify the pointers in callers Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 4 ++-- include/linux/mount.h | 2 +- kernel/audit_tree.c | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 62cffd3b6de2..346e577073bb 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2334,9 +2334,9 @@ struct path *collect_paths(const struct path *path, return res; } -void drop_collected_paths(struct path *paths, struct path *prealloc) +void drop_collected_paths(const struct path *paths, struct path *prealloc) { - for (struct path *p = paths; p->mnt; p++) + for (const struct path *p = paths; p->mnt; p++) path_put(p); if (paths != prealloc) kfree(paths); diff --git a/include/linux/mount.h b/include/linux/mount.h index 5f9c053b0897..c09032463b36 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -105,7 +105,7 @@ extern int may_umount(struct vfsmount *); int do_mount(const char *, const char __user *, const char *, unsigned long, void *); extern struct path *collect_paths(const struct path *, struct path *, unsigned); -extern void drop_collected_paths(struct path *, struct path *); +extern void drop_collected_paths(const struct path *, struct path *); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); extern int cifs_root_data(char **dev, char **opts); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index b0eae2a3c895..32007edf0e55 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -678,7 +678,7 @@ void audit_trim_trees(void) struct audit_tree *tree; struct path path; struct audit_node *node; - struct path *paths; + const struct path *paths; struct path array[16]; int err; @@ -701,7 +701,7 @@ void audit_trim_trees(void) struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... 
*/ node->index |= 1U<<31; - for (struct path *p = paths; p->dentry; p++) { + for (const struct path *p = paths; p->dentry; p++) { struct inode *inode = p->dentry->d_inode; if (inode_to_key(inode) == chunk->key) { node->index &= ~(1U<<31); @@ -740,9 +740,9 @@ void audit_put_tree(struct audit_tree *tree) put_tree(tree); } -static int tag_mounts(struct path *paths, struct audit_tree *tree) +static int tag_mounts(const struct path *paths, struct audit_tree *tree) { - for (struct path *p = paths; p->dentry; p++) { + for (const struct path *p = paths; p->dentry; p++) { int err = tag_chunk(p->dentry->d_inode, tree); if (err) return err; @@ -805,7 +805,7 @@ int audit_add_tree_rule(struct audit_krule *rule) struct audit_tree *seed = rule->tree, *tree; struct path path; struct path array[16]; - struct path *paths; + const struct path *paths; int err; rule->tree = NULL; @@ -877,7 +877,7 @@ int audit_tag_tree(char *old, char *new) int failed = 0; struct path path1, path2; struct path array[16]; - struct path *paths; + const struct path *paths; int err; err = kern_path(new, 0, &path2); -- cgit v1.2.3 From b42ffcd5069d5cfb777b8982a1c55c7e2f1d3998 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Aug 2025 19:34:37 -0400 Subject: collect_paths(): constify the return value callers have no business modifying the paths they get Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 4 ++-- include/linux/mount.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 346e577073bb..cf8dbf0741f0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2300,7 +2300,7 @@ static inline bool extend_array(struct path **res, struct path **to_free, return p; } -struct path *collect_paths(const struct path *path, +const struct path *collect_paths(const struct path *path, struct path *prealloc, unsigned count) { struct mount *root = real_mount(path->mnt); @@ -2334,7 +2334,7 @@ struct path *collect_paths(const struct path *path, return res; } -void drop_collected_paths(const struct path *paths, struct path *prealloc) +void drop_collected_paths(const struct path *paths, const struct path *prealloc) { for (const struct path *p = paths; p->mnt; p++) path_put(p); diff --git a/include/linux/mount.h b/include/linux/mount.h index c09032463b36..18e4b97f8a98 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -104,8 +104,8 @@ extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); int do_mount(const char *, const char __user *, const char *, unsigned long, void *); -extern struct path *collect_paths(const struct path *, struct path *, unsigned); -extern void drop_collected_paths(const struct path *, struct path *); +extern const struct path *collect_paths(const struct path *, struct path *, unsigned); +extern void drop_collected_paths(const struct path *, const struct path *); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); extern int cifs_root_data(char **dev, char **opts); -- cgit v1.2.3 From 880fcc329e2473ba02ffbc446fcd403972ab1fca Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Tue, 9 Sep 2025 00:03:24 -0700 Subject: drivers/perf: riscv: Export PMU event info function The event mapping function can also be used by the event info function to find the corresponding SBI PMU event encoding in get_event_info. Refactor and export it so that it can be invoked from KVM as well as from within the driver.
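A sketch of a call site for the newly exported helper, based only on the signature added below (the surrounding code is invented):

    /* Hypothetical KVM-side lookup of the SBI PMU event encoding. */
    u64 econfig = 0;
    int eidx;

    eidx = riscv_pmu_get_event_info(PERF_TYPE_HARDWARE,
                                    PERF_COUNT_HW_CPU_CYCLES, &econfig);
    if (eidx < 0)
            return eidx;    /* no SBI PMU encoding for this event */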
Signed-off-by: Atish Patra Reviewed-by: Anup Patel Acked-by: Paul Walmsley Link: https://lore.kernel.org/r/20250909-pmu_event_info-v6-5-d8f80cacb884@rivosinc.com Signed-off-by: Anup Patel --- drivers/perf/riscv_pmu_sbi.c | 122 ++++++++++++++++++++++------------------- include/linux/perf/riscv_pmu.h | 1 + 2 files changed, 68 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index a6c479f853e1..0392900d828e 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -100,6 +100,7 @@ static unsigned int riscv_pmu_irq; /* Cache the available counters in a bitmask */ static unsigned long cmask; +static int pmu_event_find_cache(u64 config); struct sbi_pmu_event_data { union { union { @@ -412,6 +413,71 @@ static bool pmu_sbi_ctr_is_fw(int cidx) return (info->type == SBI_PMU_CTR_TYPE_FW) ? true : false; } +int riscv_pmu_get_event_info(u32 type, u64 config, u64 *econfig) +{ + int ret = -ENOENT; + + switch (type) { + case PERF_TYPE_HARDWARE: + if (config >= PERF_COUNT_HW_MAX) + return -EINVAL; + ret = pmu_hw_event_map[config].event_idx; + break; + case PERF_TYPE_HW_CACHE: + ret = pmu_event_find_cache(config); + break; + case PERF_TYPE_RAW: + /* + * As per SBI v0.3 specification, + * -- the upper 16 bits must be unused for a hardware raw event. + * As per SBI v2.0 specification, + * -- the upper 8 bits must be unused for a hardware raw event. + * Bits 63:62 are used to distinguish between raw events + * 00 - Hardware raw event + * 10 - SBI firmware events + * 11 - Risc-V platform specific firmware event + */ + switch (config >> 62) { + case 0: + if (sbi_v3_available) { + /* Return error any bits [56-63] is set as it is not allowed by the spec */ + if (!(config & ~RISCV_PMU_RAW_EVENT_V2_MASK)) { + if (econfig) + *econfig = config & RISCV_PMU_RAW_EVENT_V2_MASK; + ret = RISCV_PMU_RAW_EVENT_V2_IDX; + } + /* Return error any bits [48-63] is set as it is not allowed by the spec */ + } else if (!(config & ~RISCV_PMU_RAW_EVENT_MASK)) { + if (econfig) + *econfig = config & RISCV_PMU_RAW_EVENT_MASK; + ret = RISCV_PMU_RAW_EVENT_IDX; + } + break; + case 2: + ret = (config & 0xFFFF) | (SBI_PMU_EVENT_TYPE_FW << 16); + break; + case 3: + /* + * For Risc-V platform specific firmware events + * Event code - 0xFFFF + * Event data - raw event encoding + */ + ret = SBI_PMU_EVENT_TYPE_FW << 16 | RISCV_PLAT_FW_EVENT; + if (econfig) + *econfig = config & RISCV_PMU_PLAT_FW_EVENT_MASK; + break; + default: + break; + } + break; + default: + break; + } + + return ret; +} +EXPORT_SYMBOL_GPL(riscv_pmu_get_event_info); + /* * Returns the counter width of a programmable counter and number of hardware * counters. As we don't support heterogeneous CPUs yet, it is okay to just @@ -577,7 +643,6 @@ static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig) { u32 type = event->attr.type; u64 config = event->attr.config; - int ret = -ENOENT; /* * Ensure we are finished checking standard hardware events for @@ -585,60 +650,7 @@ static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig) */ flush_work(&check_std_events_work); - switch (type) { - case PERF_TYPE_HARDWARE: - if (config >= PERF_COUNT_HW_MAX) - return -EINVAL; - ret = pmu_hw_event_map[event->attr.config].event_idx; - break; - case PERF_TYPE_HW_CACHE: - ret = pmu_event_find_cache(config); - break; - case PERF_TYPE_RAW: - /* - * As per SBI v0.3 specification, - * -- the upper 16 bits must be unused for a hardware raw event. 
- * As per SBI v2.0 specification, - * -- the upper 8 bits must be unused for a hardware raw event. - * Bits 63:62 are used to distinguish between raw events - * 00 - Hardware raw event - * 10 - SBI firmware events - * 11 - Risc-V platform specific firmware event - */ - - switch (config >> 62) { - case 0: - if (sbi_v3_available) { - if (!(config & ~RISCV_PMU_RAW_EVENT_V2_MASK)) { - *econfig = config & RISCV_PMU_RAW_EVENT_V2_MASK; - ret = RISCV_PMU_RAW_EVENT_V2_IDX; - } - } else if (!(config & ~RISCV_PMU_RAW_EVENT_MASK)) { - *econfig = config & RISCV_PMU_RAW_EVENT_MASK; - ret = RISCV_PMU_RAW_EVENT_IDX; - } - break; - case 2: - ret = (config & 0xFFFF) | (SBI_PMU_EVENT_TYPE_FW << 16); - break; - case 3: - /* - * For Risc-V platform specific firmware events - * Event code - 0xFFFF - * Event data - raw event encoding - */ - ret = SBI_PMU_EVENT_TYPE_FW << 16 | RISCV_PLAT_FW_EVENT; - *econfig = config & RISCV_PMU_PLAT_FW_EVENT_MASK; - break; - default: - break; - } - break; - default: - break; - } - - return ret; + return riscv_pmu_get_event_info(type, config, econfig); } static void pmu_sbi_snapshot_free(struct riscv_pmu *pmu) diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h index 701974639ff2..f82a28040594 100644 --- a/include/linux/perf/riscv_pmu.h +++ b/include/linux/perf/riscv_pmu.h @@ -89,6 +89,7 @@ static inline void riscv_pmu_legacy_skip_init(void) {}; struct riscv_pmu *riscv_pmu_alloc(void); #ifdef CONFIG_RISCV_PMU_SBI int riscv_pmu_get_hpm_info(u32 *hw_ctr_width, u32 *num_hw_ctr); +int riscv_pmu_get_event_info(u32 type, u64 config, u64 *econfig); #endif #endif /* CONFIG_RISCV_PMU */ -- cgit v1.2.3 From 94deac977fbd0246c971b4f1d17a6385f5e0b1a4 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Mon, 1 Sep 2025 10:52:04 +0000 Subject: fs: add an enum for number of life time hints Add WRITE_LIFE_HINT_NR into the rw_hint enum to define the number of values write lifetime hints can be set to. This is useful for e.g. file systems which may want to map these values to allocation groups. Signed-off-by: Hans Holmberg Reviewed-by: Christoph Hellwig Signed-off-by: Carlos Maiolino --- include/linux/rw_hint.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rw_hint.h b/include/linux/rw_hint.h index 309ca72f2dfb..adcc43042c90 100644 --- a/include/linux/rw_hint.h +++ b/include/linux/rw_hint.h @@ -14,6 +14,7 @@ enum rw_hint { WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, + WRITE_LIFE_HINT_NR, } __packed; /* Sparse ignores __packed annotations on enums, hence the #ifndef below. */ -- cgit v1.2.3 From 51dad33ede63618a6b425c650f3042d85e646dac Mon Sep 17 00:00:00 2001 From: Ming Yu Date: Fri, 12 Sep 2025 17:19:46 +0800 Subject: mfd: Add core driver for Nuvoton NCT6694 The Nuvoton NCT6694 provides a USB interface for the host to access its features. Sub-devices can use the USB functions nct6694_read_msg() and nct6694_write_msg() to issue a command. They can also request an interrupt that will be delivered when the USB device reports an event on its interrupt pipe.
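A sketch of how a sub-device driver might issue a read; the helper and header layout are from this patch, while the mod/cmd/sel values are invented placeholders (the real command set belongs to the sub-drivers):

    u8 data[8];
    struct nct6694_cmd_header hd = {
            .mod = 0x01,                    /* placeholder module ID */
            .cmd = 0x00,                    /* placeholder command */
            .sel = 0x00,                    /* placeholder selector */
            .len = cpu_to_le16(sizeof(data)),
    };
    int ret;

    /* hctrl is filled in by the helper (GET for reads, SET for writes) */
    ret = nct6694_read_msg(nct6694, &hd, data);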
Signed-off-by: Ming Yu Link: https://lore.kernel.org/r/20250912091952.1169369-2-a0282524688@gmail.com Signed-off-by: Lee Jones --- MAINTAINERS | 6 + drivers/mfd/Kconfig | 15 ++ drivers/mfd/Makefile | 2 + drivers/mfd/nct6694.c | 388 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/nct6694.h | 102 ++++++++++++ 5 files changed, 513 insertions(+) create mode 100644 drivers/mfd/nct6694.c create mode 100644 include/linux/mfd/nct6694.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa4..a8a05872d077 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18082,6 +18082,12 @@ F: drivers/nubus/ F: include/linux/nubus.h F: include/uapi/linux/nubus.h +NUVOTON NCT6694 MFD DRIVER +M: Ming Yu +S: Supported +F: drivers/mfd/nct6694.c +F: include/linux/mfd/nct6694.h + NUVOTON NCT7201 IIO DRIVER M: Eason Yang L: linux-iio@vger.kernel.org diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 425c5fba6cb1..f3d157776e93 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -1134,6 +1134,21 @@ config MFD_MENF21BMC This driver can also be built as a module. If so the module will be called menf21bmc. +config MFD_NCT6694 + tristate "Nuvoton NCT6694 support" + select MFD_CORE + depends on USB + help + This enables support for the Nuvoton USB device NCT6694, which shares + peripherals. + The Nuvoton NCT6694 is a peripheral expander with 16 GPIO chips, + 6 I2C controllers, 2 CANfd controllers, 2 Watchdog timers, ADC, + PWM, and RTC. + This driver provides core APIs to access the NCT6694 hardware + monitoring and control features. + Additional drivers must be enabled to utilize the specific + functionalities of the device. + config MFD_OCELOT tristate "Microsemi Ocelot External Control Support" depends on SPI_MASTER diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f7bdedd5a66d..1e7738c02b2c 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -121,6 +121,8 @@ obj-$(CONFIG_MFD_MC13XXX) += mc13xxx-core.o obj-$(CONFIG_MFD_MC13XXX_SPI) += mc13xxx-spi.o obj-$(CONFIG_MFD_MC13XXX_I2C) += mc13xxx-i2c.o +obj-$(CONFIG_MFD_NCT6694) += nct6694.o + obj-$(CONFIG_MFD_CORE) += mfd-core.o ocelot-soc-objs := ocelot-core.o ocelot-spi.o diff --git a/drivers/mfd/nct6694.c b/drivers/mfd/nct6694.c new file mode 100644 index 000000000000..308b2fda3055 --- /dev/null +++ b/drivers/mfd/nct6694.c @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Nuvoton Technology Corp. + * + * Nuvoton NCT6694 core driver using USB interface to provide + * access to the NCT6694 hardware monitoring and control features. + * + * The NCT6694 is an integrated controller that provides GPIO, I2C, + * CAN, WDT, HWMON and RTC management. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct mfd_cell nct6694_devs[] = { + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + MFD_CELL_NAME("nct6694-gpio"), + + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + MFD_CELL_NAME("nct6694-i2c"), + + MFD_CELL_NAME("nct6694-canfd"), + MFD_CELL_NAME("nct6694-canfd"), + + MFD_CELL_NAME("nct6694-wdt"), + MFD_CELL_NAME("nct6694-wdt"), + + MFD_CELL_NAME("nct6694-hwmon"), + + MFD_CELL_NAME("nct6694-rtc"), +}; + +static int nct6694_response_err_handling(struct nct6694 *nct6694, unsigned char err_status) +{ + switch (err_status) { + case NCT6694_NO_ERROR: + return 0; + case NCT6694_NOT_SUPPORT_ERROR: + dev_err(nct6694->dev, "Command is not supported!\n"); + break; + case NCT6694_NO_RESPONSE_ERROR: + dev_warn(nct6694->dev, "Command received no response!\n"); + break; + case NCT6694_TIMEOUT_ERROR: + dev_warn(nct6694->dev, "Command timed out!\n"); + break; + case NCT6694_PENDING: + dev_err(nct6694->dev, "Command is pending!\n"); + break; + default: + return -EINVAL; + } + + return -EIO; +} + +/** + * nct6694_read_msg() - Read message from NCT6694 device + * @nct6694: NCT6694 device pointer + * @cmd_hd: command header structure + * @buf: buffer to store the response data + * + * Sends a command to the NCT6694 device and reads the response. + * The command header is specified in @cmd_hd, and the response + * data is stored in @buf. + * + * Return: Negative value on error or 0 on success. 
+ */ +int nct6694_read_msg(struct nct6694 *nct6694, const struct nct6694_cmd_header *cmd_hd, void *buf) +{ + union nct6694_usb_msg *msg = nct6694->usb_msg; + struct usb_device *udev = nct6694->udev; + int tx_len, rx_len, ret; + + guard(mutex)(&nct6694->access_lock); + + memcpy(&msg->cmd_header, cmd_hd, sizeof(*cmd_hd)); + msg->cmd_header.hctrl = NCT6694_HCTRL_GET; + + /* Send command packet to USB device */ + ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, NCT6694_BULK_OUT_EP), &msg->cmd_header, + sizeof(*msg), &tx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Receive response packet from USB device */ + ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, NCT6694_BULK_IN_EP), &msg->response_header, + sizeof(*msg), &rx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Receive data packet from USB device */ + ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, NCT6694_BULK_IN_EP), buf, + le16_to_cpu(cmd_hd->len), &rx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + if (rx_len != le16_to_cpu(cmd_hd->len)) { + dev_err(nct6694->dev, "Expected received length %d, but got %d\n", + le16_to_cpu(cmd_hd->len), rx_len); + return -EIO; + } + + return nct6694_response_err_handling(nct6694, msg->response_header.sts); +} +EXPORT_SYMBOL_GPL(nct6694_read_msg); + +/** + * nct6694_write_msg() - Write message to NCT6694 device + * @nct6694: NCT6694 device pointer + * @cmd_hd: command header structure + * @buf: buffer containing the data to be sent + * + * Sends a command to the NCT6694 device and writes the data + * from @buf. The command header is specified in @cmd_hd. + * + * Return: Negative value on error or 0 on success. + */ +int nct6694_write_msg(struct nct6694 *nct6694, const struct nct6694_cmd_header *cmd_hd, void *buf) +{ + union nct6694_usb_msg *msg = nct6694->usb_msg; + struct usb_device *udev = nct6694->udev; + int tx_len, rx_len, ret; + + guard(mutex)(&nct6694->access_lock); + + memcpy(&msg->cmd_header, cmd_hd, sizeof(*cmd_hd)); + msg->cmd_header.hctrl = NCT6694_HCTRL_SET; + + /* Send command packet to USB device */ + ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, NCT6694_BULK_OUT_EP), &msg->cmd_header, + sizeof(*msg), &tx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Send data packet to USB device */ + ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, NCT6694_BULK_OUT_EP), buf, + le16_to_cpu(cmd_hd->len), &tx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Receive response packet from USB device */ + ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, NCT6694_BULK_IN_EP), &msg->response_header, + sizeof(*msg), &rx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + /* Receive data packet from USB device */ + ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, NCT6694_BULK_IN_EP), buf, + le16_to_cpu(cmd_hd->len), &rx_len, NCT6694_URB_TIMEOUT); + if (ret) + return ret; + + if (rx_len != le16_to_cpu(cmd_hd->len)) { + dev_err(nct6694->dev, "Expected transmitted length %d, but got %d\n", + le16_to_cpu(cmd_hd->len), rx_len); + return -EIO; + } + + return nct6694_response_err_handling(nct6694, msg->response_header.sts); +} +EXPORT_SYMBOL_GPL(nct6694_write_msg); + +static void usb_int_callback(struct urb *urb) +{ + struct nct6694 *nct6694 = urb->context; + __le32 *status_le = urb->transfer_buffer; + u32 int_status; + int ret; + + switch (urb->status) { + case 0: + break; + case -ECONNRESET: + case -ENOENT: + case -ESHUTDOWN: + return; + default: + goto resubmit; + } + + int_status = le32_to_cpu(*status_le); + + while (int_status) { + int irq = __ffs(int_status); + + 
generic_handle_irq_safe(irq_find_mapping(nct6694->domain, irq)); + int_status &= ~BIT(irq); + } + +resubmit: + ret = usb_submit_urb(urb, GFP_ATOMIC); + if (ret) + dev_warn(nct6694->dev, "Failed to resubmit urb, status %pe", ERR_PTR(ret)); +} + +static void nct6694_irq_enable(struct irq_data *data) +{ + struct nct6694 *nct6694 = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + guard(spinlock_irqsave)(&nct6694->irq_lock); + + nct6694->irq_enable |= BIT(hwirq); +} + +static void nct6694_irq_disable(struct irq_data *data) +{ + struct nct6694 *nct6694 = irq_data_get_irq_chip_data(data); + irq_hw_number_t hwirq = irqd_to_hwirq(data); + + guard(spinlock_irqsave)(&nct6694->irq_lock); + + nct6694->irq_enable &= ~BIT(hwirq); +} + +static const struct irq_chip nct6694_irq_chip = { + .name = "nct6694-irq", + .flags = IRQCHIP_SKIP_SET_WAKE, + .irq_enable = nct6694_irq_enable, + .irq_disable = nct6694_irq_disable, +}; + +static int nct6694_irq_domain_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) +{ + struct nct6694 *nct6694 = d->host_data; + + irq_set_chip_data(irq, nct6694); + irq_set_chip_and_handler(irq, &nct6694_irq_chip, handle_simple_irq); + + return 0; +} + +static void nct6694_irq_domain_unmap(struct irq_domain *d, unsigned int irq) +{ + irq_set_chip_and_handler(irq, NULL, NULL); + irq_set_chip_data(irq, NULL); +} + +static const struct irq_domain_ops nct6694_irq_domain_ops = { + .map = nct6694_irq_domain_map, + .unmap = nct6694_irq_domain_unmap, +}; + +static int nct6694_usb_probe(struct usb_interface *iface, + const struct usb_device_id *id) +{ + struct usb_device *udev = interface_to_usbdev(iface); + struct usb_endpoint_descriptor *int_endpoint; + struct usb_host_interface *interface; + struct device *dev = &iface->dev; + struct nct6694 *nct6694; + int ret; + + nct6694 = devm_kzalloc(dev, sizeof(*nct6694), GFP_KERNEL); + if (!nct6694) + return -ENOMEM; + + nct6694->usb_msg = devm_kzalloc(dev, sizeof(union nct6694_usb_msg), GFP_KERNEL); + if (!nct6694->usb_msg) + return -ENOMEM; + + nct6694->int_buffer = devm_kzalloc(dev, sizeof(*nct6694->int_buffer), GFP_KERNEL); + if (!nct6694->int_buffer) + return -ENOMEM; + + nct6694->int_in_urb = usb_alloc_urb(0, GFP_KERNEL); + if (!nct6694->int_in_urb) + return -ENOMEM; + + nct6694->domain = irq_domain_create_simple(NULL, NCT6694_NR_IRQS, 0, + &nct6694_irq_domain_ops, + nct6694); + if (!nct6694->domain) { + ret = -ENODEV; + goto err_urb; + } + + nct6694->dev = dev; + nct6694->udev = udev; + + ida_init(&nct6694->gpio_ida); + ida_init(&nct6694->i2c_ida); + ida_init(&nct6694->canfd_ida); + ida_init(&nct6694->wdt_ida); + + spin_lock_init(&nct6694->irq_lock); + + ret = devm_mutex_init(dev, &nct6694->access_lock); + if (ret) + goto err_ida; + + interface = iface->cur_altsetting; + + int_endpoint = &interface->endpoint[0].desc; + if (!usb_endpoint_is_int_in(int_endpoint)) { + ret = -ENODEV; + goto err_ida; + } + + usb_fill_int_urb(nct6694->int_in_urb, udev, usb_rcvintpipe(udev, NCT6694_INT_IN_EP), + nct6694->int_buffer, sizeof(*nct6694->int_buffer), usb_int_callback, + nct6694, int_endpoint->bInterval); + + ret = usb_submit_urb(nct6694->int_in_urb, GFP_KERNEL); + if (ret) + goto err_ida; + + usb_set_intfdata(iface, nct6694); + + ret = mfd_add_hotplug_devices(dev, nct6694_devs, ARRAY_SIZE(nct6694_devs)); + if (ret) + goto err_mfd; + + return 0; + +err_mfd: + usb_kill_urb(nct6694->int_in_urb); +err_ida: + ida_destroy(&nct6694->wdt_ida); + ida_destroy(&nct6694->canfd_ida); + ida_destroy(&nct6694->i2c_ida); + 
ida_destroy(&nct6694->gpio_ida); + irq_domain_remove(nct6694->domain); +err_urb: + usb_free_urb(nct6694->int_in_urb); + return ret; +} + +static void nct6694_usb_disconnect(struct usb_interface *iface) +{ + struct nct6694 *nct6694 = usb_get_intfdata(iface); + + mfd_remove_devices(nct6694->dev); + usb_kill_urb(nct6694->int_in_urb); + ida_destroy(&nct6694->wdt_ida); + ida_destroy(&nct6694->canfd_ida); + ida_destroy(&nct6694->i2c_ida); + ida_destroy(&nct6694->gpio_ida); + irq_domain_remove(nct6694->domain); + usb_free_urb(nct6694->int_in_urb); +} + +static const struct usb_device_id nct6694_ids[] = { + { USB_DEVICE_AND_INTERFACE_INFO(NCT6694_VENDOR_ID, NCT6694_PRODUCT_ID, 0xFF, 0x00, 0x00) }, + { } +}; +MODULE_DEVICE_TABLE(usb, nct6694_ids); + +static struct usb_driver nct6694_usb_driver = { + .name = "nct6694", + .id_table = nct6694_ids, + .probe = nct6694_usb_probe, + .disconnect = nct6694_usb_disconnect, +}; +module_usb_driver(nct6694_usb_driver); + +MODULE_DESCRIPTION("Nuvoton NCT6694 core driver"); +MODULE_AUTHOR("Ming Yu "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/nct6694.h b/include/linux/mfd/nct6694.h new file mode 100644 index 000000000000..6eb9be2cd4a0 --- /dev/null +++ b/include/linux/mfd/nct6694.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025 Nuvoton Technology Corp. + * + * Nuvoton NCT6694 USB transaction and data structure. + */ + +#ifndef __MFD_NCT6694_H +#define __MFD_NCT6694_H + +#define NCT6694_VENDOR_ID 0x0416 +#define NCT6694_PRODUCT_ID 0x200B +#define NCT6694_INT_IN_EP 0x81 +#define NCT6694_BULK_IN_EP 0x02 +#define NCT6694_BULK_OUT_EP 0x03 + +#define NCT6694_HCTRL_SET 0x40 +#define NCT6694_HCTRL_GET 0x80 + +#define NCT6694_URB_TIMEOUT 1000 + +enum nct6694_irq_id { + NCT6694_IRQ_GPIO0 = 0, + NCT6694_IRQ_GPIO1, + NCT6694_IRQ_GPIO2, + NCT6694_IRQ_GPIO3, + NCT6694_IRQ_GPIO4, + NCT6694_IRQ_GPIO5, + NCT6694_IRQ_GPIO6, + NCT6694_IRQ_GPIO7, + NCT6694_IRQ_GPIO8, + NCT6694_IRQ_GPIO9, + NCT6694_IRQ_GPIOA, + NCT6694_IRQ_GPIOB, + NCT6694_IRQ_GPIOC, + NCT6694_IRQ_GPIOD, + NCT6694_IRQ_GPIOE, + NCT6694_IRQ_GPIOF, + NCT6694_IRQ_CAN0, + NCT6694_IRQ_CAN1, + NCT6694_IRQ_RTC, + NCT6694_NR_IRQS, +}; + +enum nct6694_response_err_status { + NCT6694_NO_ERROR = 0, + NCT6694_FORMAT_ERROR, + NCT6694_RESERVED1, + NCT6694_RESERVED2, + NCT6694_NOT_SUPPORT_ERROR, + NCT6694_NO_RESPONSE_ERROR, + NCT6694_TIMEOUT_ERROR, + NCT6694_PENDING, +}; + +struct __packed nct6694_cmd_header { + u8 rsv1; + u8 mod; + union __packed { + __le16 offset; + struct __packed { + u8 cmd; + u8 sel; + }; + }; + u8 hctrl; + u8 rsv2; + __le16 len; +}; + +struct __packed nct6694_response_header { + u8 sequence_id; + u8 sts; + u8 reserved[4]; + __le16 len; +}; + +union __packed nct6694_usb_msg { + struct nct6694_cmd_header cmd_header; + struct nct6694_response_header response_header; +}; + +struct nct6694 { + struct device *dev; + struct ida gpio_ida; + struct ida i2c_ida; + struct ida canfd_ida; + struct ida wdt_ida; + struct irq_domain *domain; + struct mutex access_lock; + spinlock_t irq_lock; + struct urb *int_in_urb; + struct usb_device *udev; + union nct6694_usb_msg *usb_msg; + __le32 *int_buffer; + unsigned int irq_enable; +}; + +int nct6694_read_msg(struct nct6694 *nct6694, const struct nct6694_cmd_header *cmd_hd, void *buf); +int nct6694_write_msg(struct nct6694 *nct6694, const struct nct6694_cmd_header *cmd_hd, void *buf); + +#endif -- cgit v1.2.3 From a22ddeef55c4df847d9ac862b6192da774948fe1 Mon Sep 17 00:00:00 2001 From: Kamel Bouhara Date: Sun, 24 Aug 2025 13:57:21 
+0200 Subject: mfd: Add max7360 support Add core driver to support MAX7360 i2c chip, multi function device with keypad, GPIO, PWM, GPO and rotary encoder submodules. Signed-off-by: Kamel Bouhara Co-developed-by: Mathieu Dubois-Briand Signed-off-by: Mathieu Dubois-Briand Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250824-mdb-max7360-support-v14-2-435cfda2b1ea@bootlin.com Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 14 ++++ drivers/mfd/Makefile | 1 + drivers/mfd/max7360.c | 171 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/max7360.h | 109 ++++++++++++++++++++++++++++ 4 files changed, 295 insertions(+) create mode 100644 drivers/mfd/max7360.c create mode 100644 include/linux/mfd/max7360.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 425c5fba6cb1..58b1c2900d59 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -2481,5 +2481,19 @@ config MFD_UPBOARD_FPGA To compile this driver as a module, choose M here: the module will be called upboard-fpga. +config MFD_MAX7360 + tristate "Maxim MAX7360 I2C IO Expander" + depends on I2C + select MFD_CORE + select REGMAP_I2C + select REGMAP_IRQ + help + Say yes here to add support for Maxim MAX7360 device, embedding + keypad, rotary encoder, PWM and GPIO features. + + This driver provides common support for accessing the device; + additional drivers must be enabled in order to use the functionality + of the device. + endmenu endif diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f7bdedd5a66d..c81c6a8473e1 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -163,6 +163,7 @@ obj-$(CONFIG_MFD_DA9063) += da9063.o obj-$(CONFIG_MFD_DA9150) += da9150-core.o obj-$(CONFIG_MFD_MAX14577) += max14577.o +obj-$(CONFIG_MFD_MAX7360) += max7360.o obj-$(CONFIG_MFD_MAX77541) += max77541.o obj-$(CONFIG_MFD_MAX77620) += max77620.o obj-$(CONFIG_MFD_MAX77650) += max77650.o diff --git a/drivers/mfd/max7360.c b/drivers/mfd/max7360.c new file mode 100644 index 000000000000..5ee459c490ec --- /dev/null +++ b/drivers/mfd/max7360.c @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Maxim MAX7360 Core Driver + * + * Copyright 2025 Bootlin + * + * Authors: + * Kamel Bouhara + * Mathieu Dubois-Briand + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct mfd_cell max7360_cells[] = { + { .name = "max7360-pinctrl" }, + { .name = "max7360-pwm" }, + { .name = "max7360-keypad" }, + { .name = "max7360-rotary" }, + { + .name = "max7360-gpo", + .of_compatible = "maxim,max7360-gpo", + }, + { + .name = "max7360-gpio", + .of_compatible = "maxim,max7360-gpio", + }, +}; + +static const struct regmap_range max7360_volatile_ranges[] = { + regmap_reg_range(MAX7360_REG_KEYFIFO, MAX7360_REG_KEYFIFO), + regmap_reg_range(MAX7360_REG_I2C_TIMEOUT, MAX7360_REG_RTR_CNT), +}; + +static const struct regmap_access_table max7360_volatile_table = { + .yes_ranges = max7360_volatile_ranges, + .n_yes_ranges = ARRAY_SIZE(max7360_volatile_ranges), +}; + +static const struct regmap_config max7360_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = MAX7360_REG_PWMCFG(MAX7360_PORT_PWM_COUNT - 1), + .volatile_table = &max7360_volatile_table, + .cache_type = REGCACHE_MAPLE, +}; + +static int max7360_mask_irqs(struct regmap *regmap) +{ + struct device *dev = regmap_get_device(regmap); + unsigned int val; + int ret; + + /* + * GPIO/PWM interrupts are not masked on 
reset: as the MAX7360 "INTI" + * interrupt line is shared between GPIOs and rotary encoder, this could + * result in repeated spurious interrupts on the rotary encoder driver + * if the GPIO driver is not loaded. Mask them now to avoid this + * situation. + */ + for (unsigned int i = 0; i < MAX7360_PORT_PWM_COUNT; i++) { + ret = regmap_write_bits(regmap, MAX7360_REG_PWMCFG(i), + MAX7360_PORT_CFG_INTERRUPT_MASK, + MAX7360_PORT_CFG_INTERRUPT_MASK); + if (ret) + return dev_err_probe(dev, ret, + "Failed to write MAX7360 port configuration\n"); + } + + /* Read GPIO in register, to ACK any pending IRQ. */ + ret = regmap_read(regmap, MAX7360_REG_GPIOIN, &val); + if (ret) + return dev_err_probe(dev, ret, "Failed to read GPIO values\n"); + + return 0; +} + +static int max7360_reset(struct regmap *regmap) +{ + struct device *dev = regmap_get_device(regmap); + int ret; + + ret = regmap_write(regmap, MAX7360_REG_GPIOCFG, MAX7360_GPIO_CFG_GPIO_RST); + if (ret) { + dev_err(dev, "Failed to reset GPIO configuration: %x\n", ret); + return ret; + } + + ret = regcache_drop_region(regmap, MAX7360_REG_GPIOCFG, MAX7360_REG_GPIO_LAST); + if (ret) { + dev_err(dev, "Failed to drop regmap cache: %x\n", ret); + return ret; + } + + ret = regmap_write(regmap, MAX7360_REG_SLEEP, 0); + if (ret) { + dev_err(dev, "Failed to reset autosleep configuration: %x\n", ret); + return ret; + } + + ret = regmap_write(regmap, MAX7360_REG_DEBOUNCE, 0); + if (ret) + dev_err(dev, "Failed to reset GPO port count: %x\n", ret); + + return ret; +} + +static int max7360_probe(struct i2c_client *client) +{ + struct device *dev = &client->dev; + struct regmap *regmap; + int ret; + + regmap = devm_regmap_init_i2c(client, &max7360_regmap_config); + if (IS_ERR(regmap)) + return dev_err_probe(dev, PTR_ERR(regmap), "Failed to initialise regmap\n"); + + ret = max7360_reset(regmap); + if (ret) + return dev_err_probe(dev, ret, "Failed to reset device\n"); + + /* Get the device out of shutdown mode. 
*/ + ret = regmap_write_bits(regmap, MAX7360_REG_GPIOCFG, + MAX7360_GPIO_CFG_GPIO_EN, + MAX7360_GPIO_CFG_GPIO_EN); + if (ret) + return dev_err_probe(dev, ret, "Failed to enable GPIO and PWM module\n"); + + ret = max7360_mask_irqs(regmap); + if (ret) + return dev_err_probe(dev, ret, "Could not mask interrupts\n"); + + ret = devm_mfd_add_devices(dev, PLATFORM_DEVID_NONE, + max7360_cells, ARRAY_SIZE(max7360_cells), + NULL, 0, NULL); + if (ret) + return dev_err_probe(dev, ret, "Failed to register child devices\n"); + + return 0; +} + +static const struct of_device_id max7360_dt_match[] = { + { .compatible = "maxim,max7360" }, + {} +}; +MODULE_DEVICE_TABLE(of, max7360_dt_match); + +static struct i2c_driver max7360_driver = { + .driver = { + .name = "max7360", + .of_match_table = max7360_dt_match, + }, + .probe = max7360_probe, +}; +module_i2c_driver(max7360_driver); + +MODULE_DESCRIPTION("Maxim MAX7360 I2C IO Expander core driver"); +MODULE_AUTHOR("Kamel Bouhara "); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/max7360.h b/include/linux/mfd/max7360.h new file mode 100644 index 000000000000..44cf2bf651a2 --- /dev/null +++ b/include/linux/mfd/max7360.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __LINUX_MFD_MAX7360_H +#define __LINUX_MFD_MAX7360_H + +#include + +#define MAX7360_MAX_KEY_ROWS 8 +#define MAX7360_MAX_KEY_COLS 8 +#define MAX7360_MAX_KEY_NUM (MAX7360_MAX_KEY_ROWS * MAX7360_MAX_KEY_COLS) +#define MAX7360_ROW_SHIFT 3 + +#define MAX7360_MAX_GPIO 8 +#define MAX7360_MAX_GPO 6 +#define MAX7360_PORT_PWM_COUNT 8 +#define MAX7360_PORT_RTR_PIN (MAX7360_PORT_PWM_COUNT - 1) + +/* + * MAX7360 registers + */ +#define MAX7360_REG_KEYFIFO 0x00 +#define MAX7360_REG_CONFIG 0x01 +#define MAX7360_REG_DEBOUNCE 0x02 +#define MAX7360_REG_INTERRUPT 0x03 +#define MAX7360_REG_PORTS 0x04 +#define MAX7360_REG_KEYREP 0x05 +#define MAX7360_REG_SLEEP 0x06 + +/* + * MAX7360 GPIO registers + * + * All these registers are reset together when writing bit 3 of + * MAX7360_REG_GPIOCFG. 
+ */ +#define MAX7360_REG_GPIOCFG 0x40 +#define MAX7360_REG_GPIOCTRL 0x41 +#define MAX7360_REG_GPIODEB 0x42 +#define MAX7360_REG_GPIOCURR 0x43 +#define MAX7360_REG_GPIOOUTM 0x44 +#define MAX7360_REG_PWMCOM 0x45 +#define MAX7360_REG_RTRCFG 0x46 +#define MAX7360_REG_I2C_TIMEOUT 0x48 +#define MAX7360_REG_GPIOIN 0x49 +#define MAX7360_REG_RTR_CNT 0x4A +#define MAX7360_REG_PWMBASE 0x50 +#define MAX7360_REG_PWMCFGBASE 0x58 + +#define MAX7360_REG_GPIO_LAST 0x5F + +#define MAX7360_REG_PWM(x) (MAX7360_REG_PWMBASE + (x)) +#define MAX7360_REG_PWMCFG(x) (MAX7360_REG_PWMCFGBASE + (x)) + +/* + * Configuration register bits + */ +#define MAX7360_FIFO_EMPTY 0x3F +#define MAX7360_FIFO_OVERFLOW 0x7F +#define MAX7360_FIFO_RELEASE BIT(6) +#define MAX7360_FIFO_COL GENMASK(5, 3) +#define MAX7360_FIFO_ROW GENMASK(2, 0) + +#define MAX7360_CFG_SLEEP BIT(7) +#define MAX7360_CFG_INTERRUPT BIT(5) +#define MAX7360_CFG_KEY_RELEASE BIT(3) +#define MAX7360_CFG_WAKEUP BIT(1) +#define MAX7360_CFG_TIMEOUT BIT(0) + +#define MAX7360_DEBOUNCE GENMASK(4, 0) +#define MAX7360_DEBOUNCE_MIN 9 +#define MAX7360_DEBOUNCE_MAX 40 +#define MAX7360_PORTS GENMASK(8, 5) + +#define MAX7360_INTERRUPT_TIME_MASK GENMASK(4, 0) +#define MAX7360_INTERRUPT_FIFO_MASK GENMASK(7, 5) + +#define MAX7360_PORT_CFG_INTERRUPT_MASK BIT(7) +#define MAX7360_PORT_CFG_INTERRUPT_EDGES BIT(6) +#define MAX7360_PORT_CFG_COMMON_PWM BIT(5) + +/* + * Autosleep register values + */ +#define MAX7360_AUTOSLEEP_8192MS 0x01 +#define MAX7360_AUTOSLEEP_4096MS 0x02 +#define MAX7360_AUTOSLEEP_2048MS 0x03 +#define MAX7360_AUTOSLEEP_1024MS 0x04 +#define MAX7360_AUTOSLEEP_512MS 0x05 +#define MAX7360_AUTOSLEEP_256MS 0x06 + +#define MAX7360_GPIO_CFG_RTR_EN BIT(7) +#define MAX7360_GPIO_CFG_GPIO_EN BIT(4) +#define MAX7360_GPIO_CFG_GPIO_RST BIT(3) + +#define MAX7360_ROT_DEBOUNCE GENMASK(3, 0) +#define MAX7360_ROT_DEBOUNCE_MIN 0 +#define MAX7360_ROT_DEBOUNCE_MAX 15 +#define MAX7360_ROT_INTCNT GENMASK(6, 4) +#define MAX7360_ROT_INTCNT_DLY BIT(7) + +#define MAX7360_INT_INTI 0 +#define MAX7360_INT_INTK 1 + +#define MAX7360_INT_GPIO 0 +#define MAX7360_INT_KEYPAD 1 +#define MAX7360_INT_ROTARY 2 + +#define MAX7360_NR_INTERNAL_IRQS 3 + +#endif -- cgit v1.2.3 From 553b75d4bfe9264f631d459fe9996744e0672b0e Mon Sep 17 00:00:00 2001 From: Mathieu Dubois-Briand Date: Sun, 24 Aug 2025 13:57:24 +0200 Subject: gpio: regmap: Allow to allocate regmap-irq device GPIO controllers often have IRQ support: allow easily allocating both the gpio-regmap and regmap-irq devices in one operation.
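A sketch of a driver opting in to the combined allocation; the three regmap_irq_* fields are the ones added by this patch, and everything else (my_irq_chip, dev, regmap, irq) is invented context:

    struct gpio_regmap_config cfg = {
            .parent = dev,
            .regmap = regmap,
            .ngpio = 8,
            .regmap_irq_chip = &my_irq_chip,        /* invented chip description */
            .regmap_irq_line = irq,
            .regmap_irq_flags = IRQF_ONESHOT,
    };

    /* gpio-regmap creates the regmap-irq device and wires up the IRQ domain */
    gpio = devm_gpio_regmap_register(dev, &cfg);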
Reviewed-by: Andy Shevchenko Acked-by: Bartosz Golaszewski Signed-off-by: Mathieu Dubois-Briand Link: https://lore.kernel.org/r/20250824-mdb-max7360-support-v14-5-435cfda2b1ea@bootlin.com Signed-off-by: Lee Jones --- drivers/gpio/gpio-regmap.c | 29 +++++++++++++++++++++++++++-- include/linux/gpio/regmap.h | 11 +++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-regmap.c b/drivers/gpio/gpio-regmap.c index e8a32dfebdcb..e1944931ee7c 100644 --- a/drivers/gpio/gpio-regmap.c +++ b/drivers/gpio/gpio-regmap.c @@ -32,6 +32,11 @@ struct gpio_regmap { unsigned int reg_dir_in_base; unsigned int reg_dir_out_base; +#ifdef CONFIG_REGMAP_IRQ + int regmap_irq_line; + struct regmap_irq_chip_data *irq_chip_data; +#endif + int (*reg_mask_xlate)(struct gpio_regmap *gpio, unsigned int base, unsigned int offset, unsigned int *reg, unsigned int *mask); @@ -215,6 +220,7 @@ EXPORT_SYMBOL_GPL(gpio_regmap_get_drvdata); */ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config) { + struct irq_domain *irq_domain; struct gpio_regmap *gpio; struct gpio_chip *chip; int ret; @@ -295,8 +301,22 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config if (ret < 0) goto err_free_gpio; - if (config->irq_domain) { - ret = gpiochip_irqchip_add_domain(chip, config->irq_domain); +#ifdef CONFIG_REGMAP_IRQ + if (config->regmap_irq_chip) { + gpio->regmap_irq_line = config->regmap_irq_line; + ret = regmap_add_irq_chip_fwnode(dev_fwnode(config->parent), config->regmap, + config->regmap_irq_line, config->regmap_irq_flags, + 0, config->regmap_irq_chip, &gpio->irq_chip_data); + if (ret) + goto err_free_gpio; + + irq_domain = regmap_irq_get_domain(gpio->irq_chip_data); + } else +#endif + irq_domain = config->irq_domain; + + if (irq_domain) { + ret = gpiochip_irqchip_add_domain(chip, irq_domain); if (ret) goto err_remove_gpiochip; } @@ -317,6 +337,11 @@ EXPORT_SYMBOL_GPL(gpio_regmap_register); */ void gpio_regmap_unregister(struct gpio_regmap *gpio) { +#ifdef CONFIG_REGMAP_IRQ + if (gpio->irq_chip_data) + regmap_del_irq_chip(gpio->regmap_irq_line, gpio->irq_chip_data); +#endif + gpiochip_remove(&gpio->gpio_chip); kfree(gpio); } diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h index c722c67668c6..19b52ac03a5d 100644 --- a/include/linux/gpio/regmap.h +++ b/include/linux/gpio/regmap.h @@ -40,6 +40,11 @@ struct regmap; * @drvdata: (Optional) Pointer to driver specific data which is * not used by gpio-remap but is provided "as is" to the * driver callback(s). + * @regmap_irq_chip: (Optional) Pointer on an regmap_irq_chip structure. If + * set, a regmap-irq device will be created and the IRQ + * domain will be set accordingly. + * @regmap_irq_line (Optional) The IRQ the device uses to signal interrupts. + * @regmap_irq_flags (Optional) The IRQF_ flags to use for the interrupt. * * The ->reg_mask_xlate translates a given base address and GPIO offset to * register and mask pair. 
The base address is one of the given register @@ -78,6 +83,12 @@ struct gpio_regmap_config { int ngpio_per_reg; struct irq_domain *irq_domain; +#ifdef CONFIG_REGMAP_IRQ + struct regmap_irq_chip *regmap_irq_chip; + int regmap_irq_line; + unsigned long regmap_irq_flags; +#endif + int (*reg_mask_xlate)(struct gpio_regmap *gpio, unsigned int base, unsigned int offset, unsigned int *reg, unsigned int *mask); -- cgit v1.2.3 From 0627b71fa5508ab605b6e9fd74baed40805cfdda Mon Sep 17 00:00:00 2001 From: Mathieu Dubois-Briand Date: Sun, 24 Aug 2025 13:57:25 +0200 Subject: gpio: regmap: Allow to provide init_valid_mask callback Allow populating the gpio_regmap_config structure with an init_valid_mask() callback to be set on the final gpio_chip structure. Reviewed-by: Michael Walle Reviewed-by: Andy Shevchenko Reviewed-by: Linus Walleij Reviewed-by: Bartosz Golaszewski Signed-off-by: Mathieu Dubois-Briand Link: https://lore.kernel.org/r/20250824-mdb-max7360-support-v14-6-435cfda2b1ea@bootlin.com Signed-off-by: Lee Jones --- drivers/gpio/gpio-regmap.c | 1 + include/linux/gpio/regmap.h | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-regmap.c b/drivers/gpio/gpio-regmap.c index e1944931ee7c..d9d23853e032 100644 --- a/drivers/gpio/gpio-regmap.c +++ b/drivers/gpio/gpio-regmap.c @@ -261,6 +261,7 @@ struct gpio_regmap *gpio_regmap_register(const struct gpio_regmap_config *config chip->names = config->names; chip->label = config->label ?: dev_name(config->parent); chip->can_sleep = regmap_might_sleep(config->regmap); + chip->init_valid_mask = config->init_valid_mask; chip->request = gpiochip_generic_request; chip->free = gpiochip_generic_free; diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h index 19b52ac03a5d..622a2939ebe0 100644 --- a/include/linux/gpio/regmap.h +++ b/include/linux/gpio/regmap.h @@ -6,6 +6,7 @@ struct device; struct fwnode_handle; struct gpio_regmap; +struct gpio_chip; struct irq_domain; struct regmap; @@ -40,6 +41,8 @@ struct regmap; * @drvdata: (Optional) Pointer to driver specific data which is * not used by gpio-remap but is provided "as is" to the * driver callback(s). + * @init_valid_mask: (Optional) Routine to initialize @valid_mask, to be used + * if not all GPIOs are valid. * @regmap_irq_chip: (Optional) Pointer on an regmap_irq_chip structure. If * set, a regmap-irq device will be created and the IRQ * domain will be set accordingly. @@ -93,6 +96,10 @@ struct gpio_regmap_config { int (*reg_mask_xlate)(struct gpio_regmap *gpio, unsigned int base, unsigned int offset, unsigned int *reg, unsigned int *mask); + int (*init_valid_mask)(struct gpio_chip *gc, + unsigned long *valid_mask, + unsigned int ngpios); + void *drvdata; }; -- cgit v1.2.3 From f8d9e56aeb87ce82ce8636cd176cc59b69aa0e41 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Fri, 22 Aug 2025 13:56:27 +0300 Subject: i3c: master: Add helpers for DMA mapping and bounce buffer handling Some I3C controllers such as MIPI I3C HCI may pad the last DWORD (32-bit) with stale data from the RX FIFO in DMA transfers if the receive length is not DWORD aligned and when the device DMA is IOMMU mapped. In such a case, a properly sized bounce buffer is required in order to avoid possible data corruption. In a review discussion, the proposal was to have common helpers in the I3C core for DMA mapping and bounce buffer handling. Drivers may use the helper i3c_master_dma_map_single() to map a buffer for a DMA transfer. It internally allocates a bounce buffer if the buffer is not DMA'able or when the driver requires one for a transfer.
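For instance, a controller driver receiving into buf might map it as follows (a sketch; using master->dev.parent as the DMA device is an assumption, not taken from this patch):

    struct i3c_dma *dma_xfer;

    dma_xfer = i3c_master_dma_map_single(master->dev.parent, buf, len,
                                         false, DMA_FROM_DEVICE);
    if (!dma_xfer)
            return -ENOMEM;
    /* program dma_xfer->addr into the controller and start the transfer */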
Helper i3c_master_dma_unmap_single() does the needed cleanups and data copying from the bounce buffer. Signed-off-by: Jarkko Nikula Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20250822105630.2820009-2-jarkko.nikula@linux.intel.com Signed-off-by: Alexandre Belloni --- drivers/i3c/master.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/i3c/master.h | 26 ++++++++++++++++ 2 files changed, 100 insertions(+) (limited to 'include/linux') diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index 2ef898a8fd80..033d06cabca2 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -1727,6 +1728,79 @@ int i3c_master_do_daa(struct i3c_master_controller *master) } EXPORT_SYMBOL_GPL(i3c_master_do_daa); +/** + * i3c_master_dma_map_single() - Map buffer for single DMA transfer + * @dev: device object of a device doing DMA + * @buf: destination/source buffer for DMA + * @len: length of transfer + * @force_bounce: true, force to use a bounce buffer, + * false, function will auto check is a bounce buffer required + * @dir: DMA direction + * + * Map buffer for a DMA transfer and allocate a bounce buffer if required. + * + * Return: I3C DMA transfer descriptor or NULL in case of error. + */ +struct i3c_dma *i3c_master_dma_map_single(struct device *dev, void *buf, + size_t len, bool force_bounce, enum dma_data_direction dir) +{ + struct i3c_dma *dma_xfer __free(kfree) = NULL; + void *bounce __free(kfree) = NULL; + void *dma_buf = buf; + + dma_xfer = kzalloc(sizeof(*dma_xfer), GFP_KERNEL); + if (!dma_xfer) + return NULL; + + dma_xfer->dev = dev; + dma_xfer->buf = buf; + dma_xfer->dir = dir; + dma_xfer->len = len; + dma_xfer->map_len = len; + + if (is_vmalloc_addr(buf)) + force_bounce = true; + + if (force_bounce) { + dma_xfer->map_len = ALIGN(len, cache_line_size()); + if (dir == DMA_FROM_DEVICE) + bounce = kzalloc(dma_xfer->map_len, GFP_KERNEL); + else + bounce = kmemdup(buf, dma_xfer->map_len, GFP_KERNEL); + if (!bounce) + return NULL; + dma_buf = bounce; + } + + dma_xfer->addr = dma_map_single(dev, dma_buf, dma_xfer->map_len, dir); + if (dma_mapping_error(dev, dma_xfer->addr)) + return NULL; + + dma_xfer->bounce_buf = no_free_ptr(bounce); + return no_free_ptr(dma_xfer); +} +EXPORT_SYMBOL_GPL(i3c_master_dma_map_single); + +/** + * i3c_master_dma_unmap_single() - Unmap buffer after DMA + * @dma_xfer: DMA transfer and mapping descriptor + * + * Unmap buffer and cleanup DMA transfer descriptor. 
+ */ +void i3c_master_dma_unmap_single(struct i3c_dma *dma_xfer) +{ + dma_unmap_single(dma_xfer->dev, dma_xfer->addr, + dma_xfer->map_len, dma_xfer->dir); + if (dma_xfer->bounce_buf) { + if (dma_xfer->dir == DMA_FROM_DEVICE) + memcpy(dma_xfer->buf, dma_xfer->bounce_buf, + dma_xfer->len); + kfree(dma_xfer->bounce_buf); + } + kfree(dma_xfer); +} +EXPORT_SYMBOL_GPL(i3c_master_dma_unmap_single); + /** * i3c_master_set_info() - set master device information * @master: master used to send frames on the bus diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 043f5c7ff398..c52a82dd79a6 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -558,6 +558,26 @@ struct i3c_master_controller { #define i3c_bus_for_each_i3cdev(bus, dev) \ list_for_each_entry(dev, &(bus)->devs.i3c, common.node) +/** + * struct i3c_dma - DMA transfer and mapping descriptor + * @dev: device object of a device doing DMA + * @buf: destination/source buffer for DMA + * @len: length of transfer + * @map_len: length of DMA mapping + * @addr: mapped DMA address for a Host Controller Driver + * @dir: DMA direction + * @bounce_buf: an allocated bounce buffer if transfer needs it or NULL + */ +struct i3c_dma { + struct device *dev; + void *buf; + size_t len; + size_t map_len; + dma_addr_t addr; + enum dma_data_direction dir; + void *bounce_buf; +}; + int i3c_master_do_i2c_xfers(struct i3c_master_controller *master, const struct i2c_msg *xfers, int nxfers); @@ -575,6 +595,12 @@ int i3c_master_get_free_addr(struct i3c_master_controller *master, int i3c_master_add_i3c_dev_locked(struct i3c_master_controller *master, u8 addr); int i3c_master_do_daa(struct i3c_master_controller *master); +struct i3c_dma *i3c_master_dma_map_single(struct device *dev, void *ptr, + size_t len, bool force_bounce, + enum dma_data_direction dir); +void i3c_master_dma_unmap_single(struct i3c_dma *dma_xfer); +DEFINE_FREE(i3c_master_dma_unmap_single, void *, + if (_T) i3c_master_dma_unmap_single(_T)) int i3c_master_set_info(struct i3c_master_controller *master, const struct i3c_device_info *info); -- cgit v1.2.3 From 3baeae36039afc233d4a42d6ff4aa7019892619f Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 29 Aug 2025 16:10:57 +0300 Subject: PCI: Use pci_release_resource() instead of release_resource() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A few places in setup-bus.c call release_resource() directly and end up duplicating functionality from pci_release_resource() such as parent check, logging, and clearing the resource. Worse yet, the way the resource is cleared is inconsistent between different sites. Convert release_resource() calls into pci_release_resource() to remove code duplication. This will also make the resource start, end, and flags behavior consistent, i.e., start address is cleared, and only IORESOURCE_UNSET is asserted for the resource. While at it, eliminate the unnecessary initialization of idx variable in pci_bridge_release_resources(). 
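With the int return, bridge code can now bail out cleanly when a release fails; a sketch of the new calling pattern, mirroring the shape of this patch:

    ret = pci_release_resource(dev, PCI_BRIDGE_RESOURCES + idx);
    if (ret)
            return;         /* release failed; leave the window untouched */
    /* on success: res->start == 0 and IORESOURCE_UNSET is set in res->flags */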
Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20250829131113.36754-9-ilpo.jarvinen@linux.intel.com --- drivers/pci/setup-bus.c | 46 ++++++++++++++-------------------------------- drivers/pci/setup-res.c | 11 ++++++++--- include/linux/pci.h | 2 +- 3 files changed, 23 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 6bdc1af887da..b62465665abc 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -473,8 +473,6 @@ static void __assign_resources_sorted(struct list_head *head, struct pci_dev_resource *dev_res, *tmp_res, *dev_res2; struct resource *res; struct pci_dev *dev; - const char *res_name; - int idx; unsigned long fail_type; resource_size_t add_align, align; @@ -582,14 +580,7 @@ assign: res = dev_res->res; dev = dev_res->dev; - if (!res->parent) - continue; - - idx = pci_resource_num(dev, res); - res_name = pci_resource_name(dev, idx); - pci_dbg(dev, "%s %pR: releasing\n", res_name, res); - - release_resource(res); + pci_release_resource(dev, pci_resource_num(dev, res)); restore_dev_resource(dev_res); } /* Restore start/end/flags from saved list */ @@ -1732,7 +1723,7 @@ static void pci_bridge_release_resources(struct pci_bus *bus, struct resource *r; unsigned int old_flags; struct resource *b_res; - int idx = 1; + int idx, ret; b_res = &dev->resource[PCI_BRIDGE_RESOURCES]; @@ -1766,21 +1757,18 @@ static void pci_bridge_release_resources(struct pci_bus *bus, /* If there are children, release them all */ release_child_resources(r); - if (!release_resource(r)) { - type = old_flags = r->flags & PCI_RES_TYPE_MASK; - pci_info(dev, "resource %d %pR released\n", - PCI_BRIDGE_RESOURCES + idx, r); - /* Keep the old size */ - resource_set_range(r, 0, resource_size(r)); - r->flags = 0; - /* Avoiding touch the one without PREF */ - if (type & IORESOURCE_PREFETCH) - type = IORESOURCE_PREFETCH; - __pci_setup_bridge(bus, type); - /* For next child res under same bridge */ - r->flags = old_flags; - } + type = old_flags = r->flags & PCI_RES_TYPE_MASK; + ret = pci_release_resource(dev, PCI_BRIDGE_RESOURCES + idx); + if (ret) + return; + + /* Avoiding touch the one without PREF */ + if (type & IORESOURCE_PREFETCH) + type = IORESOURCE_PREFETCH; + __pci_setup_bridge(bus, type); + /* For next child res under same bridge */ + r->flags = old_flags; } enum release_type { @@ -2425,7 +2413,6 @@ int pci_reassign_bridge_resources(struct pci_dev *bridge, unsigned long type) for (i = PCI_BRIDGE_RESOURCES; i < PCI_BRIDGE_RESOURCE_END; i++) { struct resource *res = &bridge->resource[i]; - const char *res_name = pci_resource_name(bridge, i); if ((res->flags ^ type) & PCI_RES_TYPE_MASK) continue; @@ -2438,12 +2425,7 @@ int pci_reassign_bridge_resources(struct pci_dev *bridge, unsigned long type) if (ret) goto cleanup; - pci_info(bridge, "%s %pR: releasing\n", res_name, res); - - if (res->parent) - release_resource(res); - res->start = 0; - res->end = 0; + pci_release_resource(bridge, i); break; } if (i == PCI_BRIDGE_RESOURCE_END) diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index d2b3ed51e880..0468c058b598 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -406,20 +406,25 @@ int pci_reassign_resource(struct pci_dev *dev, int resno, return 0; } -void pci_release_resource(struct pci_dev *dev, int resno) +int pci_release_resource(struct pci_dev *dev, int resno) { struct resource *res = pci_resource_n(dev, resno); const char *res_name = 
pci_resource_name(dev, resno); + int ret; if (!res->parent) - return; + return 0; pci_info(dev, "%s %pR: releasing\n", res_name, res); - release_resource(res); + ret = release_resource(res); + if (ret) + return ret; res->end = resource_size(res) - 1; res->start = 0; res->flags |= IORESOURCE_UNSET; + + return 0; } EXPORT_SYMBOL(pci_release_resource); diff --git a/include/linux/pci.h b/include/linux/pci.h index 59876de13860..275df4058767 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1417,7 +1417,7 @@ void pci_reset_secondary_bus(struct pci_dev *dev); void pcibios_reset_secondary_bus(struct pci_dev *dev); void pci_update_resource(struct pci_dev *dev, int resno); int __must_check pci_assign_resource(struct pci_dev *dev, int i); -void pci_release_resource(struct pci_dev *dev, int resno); +int pci_release_resource(struct pci_dev *dev, int resno); static inline int pci_rebar_bytes_to_size(u64 bytes) { bytes = roundup_pow_of_two(bytes); -- cgit v1.2.3 From 4292a1e45fd464551efac7b2b52fd3606e956c28 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 29 Aug 2025 16:11:09 +0300 Subject: PCI: Refactor distributing available memory to use loops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pci_bus_distribute_available_resources() and pci_bridge_distribute_available_resources() retain bridge window resources and related data needed for distributing the available window in independent variables for io, memory, and prefetchable memory windows. The code is essentially the same for all of them and therefore repeated three times with different variable names. Refactor pci_bus_distribute_available_resources() to take an array. This is complicated slightly by the function taking advantage of passing the struct as value, which cannot be done for arrays in C. Therefore, copy the data into a local array in the stack in the first loop. Variable names are (hopefully) improved slightly as well. 
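The transformation in miniature (illustrative only; available_in is the new array parameter, and the local copy exists because C cannot pass arrays by value):

    struct resource available[PCI_P2P_BRIDGE_RESOURCE_NUM];
    int i;

    /* one indexed loop replaces the three parallel io/mmio/mmio_pref locals */
    for (i = 0; i < PCI_P2P_BRIDGE_RESOURCE_NUM; i++)
            available[i] = available_in[i];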
Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20250829131113.36754-21-ilpo.jarvinen@linux.intel.com --- drivers/pci/setup-bus.c | 162 +++++++++++++++++++++--------------------------- include/linux/pci.h | 3 +- 2 files changed, 74 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 720159bca54d..3bc329b1b923 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -2059,15 +2059,16 @@ static void remove_dev_resource(struct resource *avail, struct pci_dev *dev, avail->start = min(avail->start + tmp, avail->end + 1); } -static void remove_dev_resources(struct pci_dev *dev, struct resource *io, - struct resource *mmio, - struct resource *mmio_pref) +static void remove_dev_resources(struct pci_dev *dev, + struct resource available[PCI_P2P_BRIDGE_RESOURCE_NUM]) { + struct resource *mmio_pref = &available[PCI_BUS_BRIDGE_PREF_MEM_WINDOW]; struct resource *res; pci_dev_for_each_resource(dev, res) { if (resource_type(res) == IORESOURCE_IO) { - remove_dev_resource(io, dev, res); + remove_dev_resource(&available[PCI_BUS_BRIDGE_IO_WINDOW], + dev, res); } else if (resource_type(res) == IORESOURCE_MEM) { /* @@ -2081,10 +2082,13 @@ static void remove_dev_resources(struct pci_dev *dev, struct resource *io, */ if ((res->flags & IORESOURCE_PREFETCH) && ((res->flags & IORESOURCE_MEM_64) == - (mmio_pref->flags & IORESOURCE_MEM_64))) - remove_dev_resource(mmio_pref, dev, res); - else - remove_dev_resource(mmio, dev, res); + (mmio_pref->flags & IORESOURCE_MEM_64))) { + remove_dev_resource(&available[PCI_BUS_BRIDGE_PREF_MEM_WINDOW], + dev, res); + } else { + remove_dev_resource(&available[PCI_BUS_BRIDGE_MEM_WINDOW], + dev, res); + } } } } @@ -2099,45 +2103,39 @@ static void remove_dev_resources(struct pci_dev *dev, struct resource *io, * shared with the bridges. */ static void pci_bus_distribute_available_resources(struct pci_bus *bus, - struct list_head *add_list, - struct resource io, - struct resource mmio, - struct resource mmio_pref) + struct list_head *add_list, + struct resource available_in[PCI_P2P_BRIDGE_RESOURCE_NUM]) { + struct resource available[PCI_P2P_BRIDGE_RESOURCE_NUM]; unsigned int normal_bridges = 0, hotplug_bridges = 0; - struct resource *io_res, *mmio_res, *mmio_pref_res; struct pci_dev *dev, *bridge = bus->self; - resource_size_t io_per_b, mmio_per_b, mmio_pref_per_b, align; + resource_size_t per_bridge[PCI_P2P_BRIDGE_RESOURCE_NUM]; + resource_size_t align; + int i; - io_res = &bridge->resource[PCI_BRIDGE_IO_WINDOW]; - mmio_res = &bridge->resource[PCI_BRIDGE_MEM_WINDOW]; - mmio_pref_res = &bridge->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; + for (i = 0; i < PCI_P2P_BRIDGE_RESOURCE_NUM; i++) { + struct resource *res = pci_bus_resource_n(bus, i); - /* - * The alignment of this bridge is yet to be considered, hence it must - * be done now before extending its bridge window. 
- */ - align = pci_resource_alignment(bridge, io_res); - if (!io_res->parent && align) - io.start = min(ALIGN(io.start, align), io.end + 1); - - align = pci_resource_alignment(bridge, mmio_res); - if (!mmio_res->parent && align) - mmio.start = min(ALIGN(mmio.start, align), mmio.end + 1); + available[i] = available_in[i]; - align = pci_resource_alignment(bridge, mmio_pref_res); - if (!mmio_pref_res->parent && align) - mmio_pref.start = min(ALIGN(mmio_pref.start, align), - mmio_pref.end + 1); + /* + * The alignment of this bridge is yet to be considered, + * hence it must be done now before extending its bridge + * window. + */ + align = pci_resource_alignment(bridge, res); + if (!res->parent && align) + available[i].start = min(ALIGN(available[i].start, align), + available[i].end + 1); - /* - * Now that we have adjusted for alignment, update the bridge window - * resources to fill as much remaining resource space as possible. - */ - adjust_bridge_window(bridge, io_res, add_list, resource_size(&io)); - adjust_bridge_window(bridge, mmio_res, add_list, resource_size(&mmio)); - adjust_bridge_window(bridge, mmio_pref_res, add_list, - resource_size(&mmio_pref)); + /* + * Now that we have adjusted for alignment, update the + * bridge window resources to fill as much remaining + * resource space as possible. + */ + adjust_bridge_window(bridge, res, add_list, + resource_size(&available[i])); + } /* * Calculate how many hotplug bridges and normal bridges there @@ -2161,7 +2159,7 @@ static void pci_bus_distribute_available_resources(struct pci_bus *bus, */ list_for_each_entry(dev, &bus->devices, bus_list) { if (!dev->is_virtfn) - remove_dev_resources(dev, &io, &mmio, &mmio_pref); + remove_dev_resources(dev, available); } /* @@ -2173,16 +2171,9 @@ static void pci_bus_distribute_available_resources(struct pci_bus *bus, * split between non-hotplug bridges. This is to allow possible * hotplug bridges below them to get the extra space as well. */ - if (hotplug_bridges) { - io_per_b = div64_ul(resource_size(&io), hotplug_bridges); - mmio_per_b = div64_ul(resource_size(&mmio), hotplug_bridges); - mmio_pref_per_b = div64_ul(resource_size(&mmio_pref), - hotplug_bridges); - } else { - io_per_b = div64_ul(resource_size(&io), normal_bridges); - mmio_per_b = div64_ul(resource_size(&mmio), normal_bridges); - mmio_pref_per_b = div64_ul(resource_size(&mmio_pref), - normal_bridges); + for (i = 0; i < PCI_P2P_BRIDGE_RESOURCE_NUM; i++) { + per_bridge[i] = div64_ul(resource_size(&available[i]), + hotplug_bridges ?: normal_bridges); } for_each_pci_bridge(dev, bus) { @@ -2195,49 +2186,41 @@ static void pci_bus_distribute_available_resources(struct pci_bus *bus, if (hotplug_bridges && !dev->is_hotplug_bridge) continue; - res = &dev->resource[PCI_BRIDGE_IO_WINDOW]; + for (i = 0; i < PCI_P2P_BRIDGE_RESOURCE_NUM; i++) { + res = pci_bus_resource_n(bus, i); - /* - * Make sure the split resource space is properly aligned - * for bridge windows (align it down to avoid going above - * what is available). - */ - align = pci_resource_alignment(dev, res); - resource_set_size(&io, ALIGN_DOWN_IF_NONZERO(io_per_b, align)); - - /* - * The x_per_b holds the extra resource space that can be - * added for each bridge but there is the minimal already - * reserved as well so adjust x.start down accordingly to - * cover the whole space. 
- */ - io.start -= resource_size(res); - - res = &dev->resource[PCI_BRIDGE_MEM_WINDOW]; - align = pci_resource_alignment(dev, res); - resource_set_size(&mmio, - ALIGN_DOWN_IF_NONZERO(mmio_per_b,align)); - mmio.start -= resource_size(res); + /* + * Make sure the split resource space is properly + * aligned for bridge windows (align it down to + * avoid going above what is available). + */ + align = pci_resource_alignment(dev, res); + resource_set_size(&available[i], + ALIGN_DOWN_IF_NONZERO(per_bridge[i], + align)); - res = &dev->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; - align = pci_resource_alignment(dev, res); - resource_set_size(&mmio_pref, - ALIGN_DOWN_IF_NONZERO(mmio_pref_per_b, align)); - mmio_pref.start -= resource_size(res); + /* + * The per_bridge holds the extra resource space + * that can be added for each bridge but there is + * the minimal already reserved as well so adjust + * x.start down accordingly to cover the whole + * space. + */ + available[i].start -= resource_size(res); + } - pci_bus_distribute_available_resources(b, add_list, io, mmio, - mmio_pref); + pci_bus_distribute_available_resources(b, add_list, available); - io.start += io.end + 1; - mmio.start += mmio.end + 1; - mmio_pref.start += mmio_pref.end + 1; + for (i = 0; i < PCI_P2P_BRIDGE_RESOURCE_NUM; i++) + available[i].start += available[i].end + 1; } } static void pci_bridge_distribute_available_resources(struct pci_dev *bridge, struct list_head *add_list) { - struct resource available_io, available_mmio, available_mmio_pref; + struct resource *res, available[PCI_P2P_BRIDGE_RESOURCE_NUM]; + unsigned int i; if (!bridge->is_hotplug_bridge) return; @@ -2245,14 +2228,13 @@ static void pci_bridge_distribute_available_resources(struct pci_dev *bridge, pci_dbg(bridge, "distributing available resources\n"); /* Take the initial extra resources from the hotplug port */ - available_io = bridge->resource[PCI_BRIDGE_IO_WINDOW]; - available_mmio = bridge->resource[PCI_BRIDGE_MEM_WINDOW]; - available_mmio_pref = bridge->resource[PCI_BRIDGE_PREF_MEM_WINDOW]; + for (i = 0; i < PCI_P2P_BRIDGE_RESOURCE_NUM; i++) { + res = pci_resource_n(bridge, PCI_BRIDGE_RESOURCES + i); + available[i] = *res; + } pci_bus_distribute_available_resources(bridge->subordinate, - add_list, available_io, - available_mmio, - available_mmio_pref); + add_list, available); } static bool pci_bridge_resources_not_assigned(struct pci_dev *dev) diff --git a/include/linux/pci.h b/include/linux/pci.h index 275df4058767..723e9cede69d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -119,7 +119,8 @@ enum { #define PCI_CB_BRIDGE_MEM_1_WINDOW (PCI_BRIDGE_RESOURCES + 3) /* Total number of bridge resources for P2P and CardBus */ -#define PCI_BRIDGE_RESOURCE_NUM 4 +#define PCI_P2P_BRIDGE_RESOURCE_NUM 3 +#define PCI_BRIDGE_RESOURCE_NUM 4 /* Resources assigned to buses behind the bridge */ PCI_BRIDGE_RESOURCES, -- cgit v1.2.3 From dadb3ebcf395ebee3626d88ac7e5e234f15bae2c Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Sun, 14 Sep 2025 15:44:26 +0200 Subject: workqueue: WQ_PERCPU added to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, if a user enqueues a work item using schedule_delayed_work(), the wq used is "system_wq" (a per-cpu wq), while queue_delayed_work() uses WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work(), which uses system_wq, and to queue_work(), which again makes use of WORK_CPU_UNBOUND.
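As a hedged sketch (not from this patch; my_wq is a hypothetical workqueue), the two enqueue paths look like:

| schedule_work(&work);		/* implicitly targets system_wq, a per-CPU wq */
| queue_work(my_wq, &work);	/* wraps queue_work_on(WORK_CPU_UNBOUND, my_wq, &work) */

After this change, callers that want the per-CPU behavior request it explicitly:

| my_wq = alloc_workqueue("my_wq", WQ_PERCPU, 0);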
This lack of consistency cannot be addressed without refactoring the API. alloc_workqueue() treats all queues as per-CPU by default, while unbound workqueues must opt-in via WQ_UNBOUND. This default is suboptimal: most workloads benefit from unbound queues, allowing the scheduler to place worker threads where they’re needed and reducing noise when CPUs are isolated. This patch adds a new WQ_PERCPU flag to explicitly request per-CPU behavior. Both flags coexist for one release cycle to allow callers to transition their calls. Once migration is complete, WQ_UNBOUND can be removed and unbound will become the implicit default. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. All existing users have been updated accordingly. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 4 ++-- kernel/workqueue.c | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index b6834b7aee4b..71a9900c03c7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -410,7 +410,7 @@ enum wq_flags { __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ /* BH wq only allows the following flags */ - __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI, + __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI | WQ_PERCPU, }; enum wq_consts { @@ -570,7 +570,7 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ - alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) + alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_PERCPU, 1, (name)) #define create_freezable_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \ WQ_MEM_RECLAIM, 1, (name)) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 90db8cf015c2..45320e27a16c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7828,22 +7828,22 @@ void __init workqueue_init_early(void) ordered_wq_attrs[i] = attrs; } - system_wq = alloc_workqueue("events", 0, 0); - system_percpu_wq = alloc_workqueue("events", 0, 0); - system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); - system_long_wq = alloc_workqueue("events_long", 0, 0); + system_wq = alloc_workqueue("events", WQ_PERCPU, 0); + system_percpu_wq = alloc_workqueue("events", WQ_PERCPU, 0); + system_highpri_wq = alloc_workqueue("events_highpri", + WQ_HIGHPRI | WQ_PERCPU, 0); + system_long_wq = alloc_workqueue("events_long", WQ_PERCPU, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", - WQ_FREEZABLE, 0); + WQ_FREEZABLE | WQ_PERCPU, 0); system_power_efficient_wq = alloc_workqueue("events_power_efficient", - WQ_POWER_EFFICIENT, 0); + WQ_POWER_EFFICIENT | WQ_PERCPU, 0); system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient", - WQ_FREEZABLE | WQ_POWER_EFFICIENT, - 0); + WQ_FREEZABLE | WQ_POWER_EFFICIENT | WQ_PERCPU, 0); - system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0); + system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0); system_bh_highpri_wq = alloc_workqueue("events_bh_highpri", - WQ_BH | WQ_HIGHPRI, 0); + WQ_BH
| WQ_HIGHPRI | WQ_PERCPU, 0); BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_dfl_wq || !system_power_efficient_wq || -- cgit v1.2.3 From 6b4be64fd9fec16418f365c2d8e47a7566e9eba5 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 15 Sep 2025 15:24:32 +0300 Subject: net/mlx5e: Harden uplink netdev access against device unbind The function mlx5_uplink_netdev_get() gets the uplink netdevice pointer from mdev->mlx5e_res.uplink_netdev. However, the netdevice can be removed and its pointer cleared when unbound from the mlx5_core.eth driver. This results in a NULL pointer, causing a kernel panic. BUG: unable to handle page fault for address: 0000000000001300 at RIP: 0010:mlx5e_vport_rep_load+0x22a/0x270 [mlx5_core] Call Trace: mlx5_esw_offloads_rep_load+0x68/0xe0 [mlx5_core] esw_offloads_enable+0x593/0x910 [mlx5_core] mlx5_eswitch_enable_locked+0x341/0x420 [mlx5_core] mlx5_devlink_eswitch_mode_set+0x17e/0x3a0 [mlx5_core] devlink_nl_eswitch_set_doit+0x60/0xd0 genl_family_rcv_msg_doit+0xe0/0x130 genl_rcv_msg+0x183/0x290 netlink_rcv_skb+0x4b/0xf0 genl_rcv+0x24/0x40 netlink_unicast+0x255/0x380 netlink_sendmsg+0x1f3/0x420 __sock_sendmsg+0x38/0x60 __sys_sendto+0x119/0x180 do_syscall_64+0x53/0x1d0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Ensure the pointer is valid before use by checking it for NULL. If it is valid, immediately call netdev_hold() to take a reference, and preventing the netdevice from being freed while it is in use. Fixes: 7a9fb35e8c3a ("net/mlx5e: Do not reload ethernet ports when changing eswitch mode") Signed-off-by: Jianbo Liu Reviewed-by: Cosmin Ratiu Reviewed-by: Jiri Pirko Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1757939074-617281-2-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 27 ++++++++++++++++++---- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h | 15 +++++++++++- include/linux/mlx5/driver.h | 1 + 4 files changed, 38 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 63a7a788fb0d..cd0242eb008c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1506,12 +1506,21 @@ static const struct mlx5e_profile mlx5e_uplink_rep_profile = { static int mlx5e_vport_uplink_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) { - struct mlx5e_priv *priv = netdev_priv(mlx5_uplink_netdev_get(dev)); struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); + struct net_device *netdev; + struct mlx5e_priv *priv; + int err; + + netdev = mlx5_uplink_netdev_get(dev); + if (!netdev) + return 0; + priv = netdev_priv(netdev); rpriv->netdev = priv->netdev; - return mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile, - rpriv); + err = mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile, + rpriv); + mlx5_uplink_netdev_put(dev, netdev); + return err; } static void @@ -1638,8 +1647,16 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep) { struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); struct net_device *netdev = rpriv->netdev; - struct mlx5e_priv *priv = netdev_priv(netdev); - void *ppriv = priv->ppriv; + struct mlx5e_priv *priv; + void *ppriv; + + if (!netdev) { + ppriv = rpriv; + goto free_ppriv; + } + + priv 
= netdev_priv(netdev); + ppriv = priv->ppriv; if (rep->vport == MLX5_VPORT_UPLINK) { mlx5e_vport_uplink_rep_unload(rpriv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 8b4977650183..5f2d6c35f1ad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -1515,6 +1515,7 @@ static u32 mlx5_esw_qos_lag_link_speed_get_locked(struct mlx5_core_dev *mdev) speed = lksettings.base.speed; out: + mlx5_uplink_netdev_put(mdev, slave); return speed; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h index b111ccd03b02..74ea5da58b7e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h @@ -47,7 +47,20 @@ int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data); static inline struct net_device *mlx5_uplink_netdev_get(struct mlx5_core_dev *mdev) { - return mdev->mlx5e_res.uplink_netdev; + struct mlx5e_resources *mlx5e_res = &mdev->mlx5e_res; + struct net_device *netdev; + + mutex_lock(&mlx5e_res->uplink_netdev_lock); + netdev = mlx5e_res->uplink_netdev; + netdev_hold(netdev, &mlx5e_res->tracker, GFP_KERNEL); + mutex_unlock(&mlx5e_res->uplink_netdev_lock); + return netdev; +} + +static inline void mlx5_uplink_netdev_put(struct mlx5_core_dev *mdev, + struct net_device *netdev) +{ + netdev_put(netdev, &mdev->mlx5e_res.tracker); } struct mlx5_sd; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8c5fbfb85749..10fe492e1fed 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -663,6 +663,7 @@ struct mlx5e_resources { bool tisn_valid; } hw_objs; struct net_device *uplink_netdev; + netdevice_tracker tracker; struct mutex uplink_netdev_lock; struct mlx5_crypto_dek_priv *dek_priv; }; -- cgit v1.2.3 From ea6bb47fd6a4c5a332f9349c39bf7462e3e7a35b Mon Sep 17 00:00:00 2001 From: Alan Borzeszkowski Date: Wed, 27 Aug 2025 13:56:46 +0200 Subject: thunderbolt: Update thunderbolt.h header file Make Thunderbolt header file compliant with current kernel-doc standards. No functional changes. Signed-off-by: Alan Borzeszkowski Signed-off-by: Mika Westerberg --- include/linux/thunderbolt.h | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 75247486616b..0ba112175bb3 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -213,7 +213,7 @@ enum tb_link_width { * queried first * @service_ids: Used to generate IDs for the services * @in_hopids: Input HopIDs for DMA tunneling - * @out_hopids; Output HopIDs for DMA tunneling + * @out_hopids: Output HopIDs for DMA tunneling * @local_property_block: Local block of properties * @local_property_block_gen: Generation of @local_property_block * @local_property_block_len: Length of the @local_property_block in dwords @@ -356,7 +356,7 @@ int tb_xdomain_request(struct tb_xdomain *xd, const void *request, unsigned int timeout_msec); /** - * tb_protocol_handler - Protocol specific handler + * struct tb_protocol_handler - Protocol specific handler * @uuid: XDomain messages with this UUID are dispatched to this handler * @callback: Callback called with the XDomain message. 
Returning %1 * here tells the XDomain core that the message was handled @@ -437,7 +437,7 @@ static inline struct tb_service *tb_to_service(struct device *dev) } /** - * tb_service_driver - Thunderbolt service driver + * struct tb_service_driver - Thunderbolt service driver * @driver: Driver structure * @probe: Called when the driver is probed * @remove: Called when the driver is removed (optional) @@ -519,6 +519,7 @@ struct tb_nhi { * @head: Head of the ring (write next descriptor here) * @tail: Tail of the ring (complete next descriptor here) * @descriptors: Allocated descriptors for this ring + * @descriptors_dma: DMA address of descriptors for this ring * @queue: Queue holding frames to be transferred over this ring * @in_flight: Queue holding frames that are currently in flight * @work: Interrupt work structure @@ -571,12 +572,12 @@ typedef void (*ring_cb)(struct tb_ring *, struct ring_frame *, bool canceled); /** * enum ring_desc_flags - Flags for DMA ring descriptor - * %RING_DESC_ISOCH: Enable isonchronous DMA (Tx only) - * %RING_DESC_CRC_ERROR: In frame mode CRC check failed for the frame (Rx only) - * %RING_DESC_COMPLETED: Descriptor completed (set by NHI) - * %RING_DESC_POSTED: Always set this - * %RING_DESC_BUFFER_OVERRUN: RX buffer overrun - * %RING_DESC_INTERRUPT: Request an interrupt on completion + * @RING_DESC_ISOCH: Enable isonchronous DMA (Tx only) + * @RING_DESC_CRC_ERROR: In frame mode CRC check failed for the frame (Rx only) + * @RING_DESC_COMPLETED: Descriptor completed (set by NHI) + * @RING_DESC_POSTED: Always set this + * @RING_DESC_BUFFER_OVERRUN: RX buffer overrun + * @RING_DESC_INTERRUPT: Request an interrupt on completion */ enum ring_desc_flags { RING_DESC_ISOCH = 0x1, @@ -636,7 +637,7 @@ int __tb_ring_enqueue(struct tb_ring *ring, struct ring_frame *frame); * If ring_stop() is called after the packet has been enqueued * @frame->callback will be called with canceled set to true. * - * Return: Returns %-ESHUTDOWN if ring_stop has been called. Zero otherwise. + * Return: %-ESHUTDOWN if ring_stop() has been called, %0 otherwise. */ static inline int tb_ring_rx(struct tb_ring *ring, struct ring_frame *frame) { @@ -657,7 +658,7 @@ static inline int tb_ring_rx(struct tb_ring *ring, struct ring_frame *frame) * If ring_stop() is called after the packet has been enqueued @frame->callback * will be called with canceled set to true. * - * Return: Returns %-ESHUTDOWN if ring_stop has been called. Zero otherwise. + * Return: %-ESHUTDOWN if ring_stop has been called, %0 otherwise. */ static inline int tb_ring_tx(struct tb_ring *ring, struct ring_frame *frame) { @@ -675,6 +676,8 @@ void tb_ring_poll_complete(struct tb_ring *ring); * * Use this function when you are mapping DMA for buffers that are * passed to the ring for sending/receiving. + * + * Return: Pointer to device used for DMA mapping. */ static inline struct device *tb_ring_dma_device(struct tb_ring *ring) { -- cgit v1.2.3 From de2be98541dbe0de58d2dccf7fa19dfc9d9a8260 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 11 Sep 2025 10:10:17 +0300 Subject: net/mlx5: Remove VLAN insertion fields from WQE Ether segment Now that the driver no longer uses VLAN TX insertion via the WQE Ethernet segment, the related fields and flags can be removed. 
Signed-off-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1757574619-604874-2-git-send-email-tariqt@nvidia.com Reviewed-by: Simon Horman Signed-off-by: Leon Romanovsky --- include/linux/mlx5/qp.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index fc7eeff99a8a..5546c7bd2c83 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -237,13 +237,11 @@ enum { }; enum { - MLX5_ETH_WQE_SVLAN = 1 << 0, MLX5_ETH_WQE_TRAILER_HDR_OUTER_IP_ASSOC = 1 << 26, MLX5_ETH_WQE_TRAILER_HDR_OUTER_L4_ASSOC = 1 << 27, MLX5_ETH_WQE_TRAILER_HDR_INNER_IP_ASSOC = 3 << 26, MLX5_ETH_WQE_TRAILER_HDR_INNER_L4_ASSOC = 1 << 28, MLX5_ETH_WQE_INSERT_TRAILER = 1 << 30, - MLX5_ETH_WQE_INSERT_VLAN = 1 << 15, }; enum { @@ -275,10 +273,6 @@ struct mlx5_wqe_eth_seg { DECLARE_FLEX_ARRAY(u8, data); }; } inline_hdr; - struct { - __be16 type; - __be16 vlan_tci; - } insert; __be32 trailer; }; }; -- cgit v1.2.3 From cce65f32443b61db2370a67d2e92d16b773fe8a4 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 11 Sep 2025 10:10:18 +0300 Subject: net/mlx5: Refactor MACsec WQE metadata shifts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce MLX5_ETH_WQE_FT_META_SHIFT as a shared base offset for features that use the lower 8 bits of the WQE flow_table_metadata field, currently used for timestamping, IPsec, and MACsec. Define MLX5_ETH_WQE_FT_META_MACSEC_FS_ID_MASK so that fs_id occupies bits 2–5, making it clear that fs_id occupies bits in the metadata. Set MLX5_ETH_WQE_FT_META_MACSEC_MASK as the OR of the MACsec flag and MLX5_ETH_WQE_FT_META_MACSEC_FS_ID_MASK, corresponding to the original 0x3E mask. Update the fs_id macro to right-shift the MACsec flag by MLX5_ETH_WQE_FT_META_SHIFT and update the RoCE modify-header action to use it. Introduce the helper macro MLX5_MACSEC_TX_METADATA(fs_id) to compose the full shifted MACsec metadata value. These changes make it explicit exactly which metadata bits carry MACsec information, simplifying future feature exclusions when multiple features share the WQE flowtable metadata. In addition, drop the incorrect “RX flow steering” comment, since this applies to TX flow steering. 
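A worked example of the macro composition above, assuming fs_id = 2 (values taken from this patch, where MLX5_ETH_WQE_FT_META_SHIFT is still 0):

| /* MLX5_ETH_WQE_FT_META_MACSEC            = BIT(1)        = 0x02 */
| /* MLX5_ETH_WQE_FT_META_MACSEC_FS_ID_MASK = GENMASK(5, 2) = 0x3C */
| /* MLX5_ETH_WQE_FT_META_MACSEC_MASK       = 0x02 | 0x3C   = 0x3E */
| /* MLX5_MACSEC_TX_METADATA(2)             = 0x02 | (2<<2) = 0x0A */

The 0x3E result confirms the new mask is bit-for-bit identical to the original hard-coded 0x3E.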
Signed-off-by: Carolina Jubran Reviewed-by: Jianbo Liu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1757574619-604874-3-git-send-email-tariqt@nvidia.com Reviewed-by: Simon Horman Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c | 12 +++++------- drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.h | 15 +++++++++++++++ include/linux/mlx5/qp.h | 9 +++++++-- 4 files changed, 28 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c index 6ab02f3fc291..528b04d4de41 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c @@ -1676,7 +1676,7 @@ void mlx5e_macsec_tx_build_eseg(struct mlx5e_macsec *macsec, if (!fs_id) return; - eseg->flow_table_metadata = cpu_to_be32(MLX5_ETH_WQE_FT_META_MACSEC | fs_id << 2); + eseg->flow_table_metadata = cpu_to_be32(MLX5_MACSEC_TX_METADATA(fs_id)); } void mlx5e_macsec_offload_handle_rx_skb(struct net_device *netdev, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c index 762d55ba9e51..9ec450603176 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c @@ -45,11 +45,7 @@ #define MLX5_SECTAG_HEADER_SIZE_WITHOUT_SCI 0x8 #define MLX5_SECTAG_HEADER_SIZE_WITH_SCI (MLX5_SECTAG_HEADER_SIZE_WITHOUT_SCI + MACSEC_SCI_LEN) -/* MACsec RX flow steering */ -#define MLX5_ETH_WQE_FT_META_MACSEC_MASK 0x3E - /* MACsec fs_id handling for steering */ -#define macsec_fs_set_tx_fs_id(fs_id) (MLX5_ETH_WQE_FT_META_MACSEC | (fs_id) << 2) #define macsec_fs_set_rx_fs_id(fs_id) ((fs_id) | BIT(30)) struct mlx5_sectag_header { @@ -597,7 +593,7 @@ static int macsec_fs_tx_setup_fte(struct mlx5_macsec_fs *macsec_fs, MLX5_SET(fte_match_param, spec->match_criteria, misc_parameters_2.metadata_reg_a, MLX5_ETH_WQE_FT_META_MACSEC_MASK); MLX5_SET(fte_match_param, spec->match_value, misc_parameters_2.metadata_reg_a, - macsec_fs_set_tx_fs_id(id)); + MLX5_MACSEC_TX_METADATA(id)); *fs_id = id; flow_act->crypto.type = MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_MACSEC; @@ -2219,8 +2215,10 @@ static int mlx5_macsec_fs_add_roce_rule_tx(struct mlx5_macsec_fs *macsec_fs, u32 MLX5_SET(set_action_in, action, action_type, MLX5_ACTION_TYPE_SET); MLX5_SET(set_action_in, action, field, MLX5_ACTION_IN_FIELD_METADATA_REG_A); - MLX5_SET(set_action_in, action, data, macsec_fs_set_tx_fs_id(fs_id)); - MLX5_SET(set_action_in, action, offset, 0); + MLX5_SET(set_action_in, action, data, + mlx5_macsec_fs_set_tx_fs_id(fs_id)); + MLX5_SET(set_action_in, action, offset, + MLX5_ETH_WQE_FT_META_MACSEC_SHIFT); MLX5_SET(set_action_in, action, length, 32); modify_hdr = mlx5_modify_header_alloc(mdev, MLX5_FLOW_NAMESPACE_RDMA_TX_MACSEC, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.h index 34b80c3ef6a5..15acaff43641 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.h @@ -12,6 +12,21 @@ #define MLX5_MACSEC_METADATA_MARKER(metadata) ((((metadata) >> 30) & 0x3) == 0x1) #define MLX5_MACSEC_RX_METADAT_HANDLE(metadata) ((metadata) & MLX5_MACSEC_RX_FS_ID_MASK) +/* MACsec TX flow 
steering */ +#define MLX5_ETH_WQE_FT_META_MACSEC_MASK \ + (MLX5_ETH_WQE_FT_META_MACSEC | MLX5_ETH_WQE_FT_META_MACSEC_FS_ID_MASK) +#define MLX5_ETH_WQE_FT_META_MACSEC_SHIFT MLX5_ETH_WQE_FT_META_SHIFT + +/* MACsec fs_id handling for steering */ +#define mlx5_macsec_fs_set_tx_fs_id(fs_id) \ + (((MLX5_ETH_WQE_FT_META_MACSEC) >> MLX5_ETH_WQE_FT_META_MACSEC_SHIFT) \ + | ((fs_id) << 2)) + +#define MLX5_MACSEC_TX_METADATA(fs_id) \ + (mlx5_macsec_fs_set_tx_fs_id(fs_id) << \ + MLX5_ETH_WQE_FT_META_MACSEC_SHIFT) + +/* MACsec fs_id uses 4 bits, supports up to 16 interfaces */ #define MLX5_MACSEC_NUM_OF_SUPPORTED_INTERFACES 16 struct mlx5_macsec_fs; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 5546c7bd2c83..b21be7630575 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -251,9 +251,14 @@ enum { MLX5_ETH_WQE_SWP_OUTER_L4_UDP = 1 << 5, }; +/* Base shift for metadata bits used by timestamping, IPsec, and MACsec */ +#define MLX5_ETH_WQE_FT_META_SHIFT 0 + enum { - MLX5_ETH_WQE_FT_META_IPSEC = BIT(0), - MLX5_ETH_WQE_FT_META_MACSEC = BIT(1), + MLX5_ETH_WQE_FT_META_IPSEC = BIT(0) << MLX5_ETH_WQE_FT_META_SHIFT, + MLX5_ETH_WQE_FT_META_MACSEC = BIT(1) << MLX5_ETH_WQE_FT_META_SHIFT, + MLX5_ETH_WQE_FT_META_MACSEC_FS_ID_MASK = + GENMASK(5, 2) << MLX5_ETH_WQE_FT_META_SHIFT, }; struct mlx5_wqe_eth_seg { -- cgit v1.2.3 From 2ac207381c37eebc49559634ce5642119784bc7c Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 11 Sep 2025 10:10:19 +0300 Subject: net/mlx5e: Prevent WQE metadata conflicts between timestamping and offloads Update the WQE metadata assignment to avoid overriding existing metadata when setting the sysport timestamp ID. Since timestamp IDs are limited to 256 values, they use only the lower 8 bits of the metadata field. To avoid conflicts, move IPsec and MACsec metadata ID to bits 8 and 9, and shift the MACsec fs_id accordingly. This ensures safe coexistence of timestamping and offload features that use the same metadata field. 
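The resulting metadata layout, derived from this patch (illustrative summary, not authoritative):

| /* bits 0-7  : PTP timestamp ID (up to 256 values)       */
| /* bit  8    : MLX5_ETH_WQE_FT_META_IPSEC  (BIT(0) << 8) */
| /* bit  9    : MLX5_ETH_WQE_FT_META_MACSEC (BIT(1) << 8) */
| /* bits 10-13: MACsec fs_id, GENMASK(5, 2) << 8          */

This layout is also why mlx5e_cqe_ts_id_eseg() now ORs the timestamp ID into flow_table_metadata instead of overwriting it.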
Signed-off-by: Carolina Jubran Reviewed-by: Jianbo Liu Reviewed-by: Patrisious Haddad Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1757574619-604874-4-git-send-email-tariqt@nvidia.com Reviewed-by: Simon Horman Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c | 2 +- include/linux/mlx5/qp.h | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index 319061d31602..6c55b67b7335 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -653,7 +653,7 @@ static void mlx5e_cqe_ts_id_eseg(struct mlx5e_ptpsq *ptpsq, struct sk_buff *skb, struct mlx5_wqe_eth_seg *eseg) { if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) - eseg->flow_table_metadata = + eseg->flow_table_metadata |= cpu_to_be32(mlx5e_ptp_metadata_fifo_peek(&ptpsq->metadata_freelist)); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c index 9ec450603176..e6be2f01daf4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/macsec_fs.c @@ -2219,7 +2219,7 @@ static int mlx5_macsec_fs_add_roce_rule_tx(struct mlx5_macsec_fs *macsec_fs, u32 mlx5_macsec_fs_set_tx_fs_id(fs_id)); MLX5_SET(set_action_in, action, offset, MLX5_ETH_WQE_FT_META_MACSEC_SHIFT); - MLX5_SET(set_action_in, action, length, 32); + MLX5_SET(set_action_in, action, length, 8); modify_hdr = mlx5_modify_header_alloc(mdev, MLX5_FLOW_NAMESPACE_RDMA_TX_MACSEC, 1, action); diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index b21be7630575..d67aedc6ea68 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -251,8 +251,9 @@ enum { MLX5_ETH_WQE_SWP_OUTER_L4_UDP = 1 << 5, }; -/* Base shift for metadata bits used by timestamping, IPsec, and MACsec */ -#define MLX5_ETH_WQE_FT_META_SHIFT 0 +/* Metadata bits 0-7 are used by timestamping */ +/* Base shift for metadata bits used by IPsec and MACsec */ +#define MLX5_ETH_WQE_FT_META_SHIFT 8 enum { MLX5_ETH_WQE_FT_META_IPSEC = BIT(0) << MLX5_ETH_WQE_FT_META_SHIFT, -- cgit v1.2.3 From 0b1619c38600fc06c73b1f59c64af0b7df08fc2c Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 15 Sep 2025 11:10:07 +0200 Subject: gpio: nomadik: fix the debugfs helper stub Commit ddeb66d2cb10 ("gpio: nomadik: don't print out global GPIO numbers in debugfs callbacks") failed to also update the stub of the debugfs helper for !CONFIG_DEBUG_FS. Fix the resulting build failure. 
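The underlying pattern, sketched with a hypothetical helper (the stub must mirror the real prototype or !CONFIG_DEBUG_FS builds break):

| #ifdef CONFIG_DEBUG_FS
| void dbg_show_one(struct seq_file *s, unsigned int offset);
| #else
| static inline void dbg_show_one(struct seq_file *s, unsigned int offset) { }
| #endif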
Fixes: ddeb66d2cb10 ("gpio: nomadik: don't print out global GPIO numbers in debugfs callbacks") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509132232.12viPUPB-lkp@intel.com/ Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20250915091007.28438-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/gpio-nomadik.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/gpio-nomadik.h b/include/linux/gpio/gpio-nomadik.h index 7ba53b499e16..592a774a53cd 100644 --- a/include/linux/gpio/gpio-nomadik.h +++ b/include/linux/gpio/gpio-nomadik.h @@ -268,8 +268,7 @@ void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, static inline void nmk_gpio_dbg_show_one(struct seq_file *s, struct pinctrl_dev *pctldev, struct gpio_chip *chip, - unsigned int offset, - unsigned int gpio) + unsigned int offset) { } -- cgit v1.2.3 From d1dd75c6500c74b91c5286fd3277710371d3e3ca Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Sat, 13 Sep 2025 16:12:54 +0000 Subject: HID: core: Change hid_driver to use a const char* for name name is never mutated by the core HID stack. Making name a const char* simplifies passing the string from Rust to C. Otherwise, it becomes difficult to pass a 'static lifetime CStr from Rust to a char*, rather than a const char*, due to lack of guarantee that the underlying data of the CStr will not be mutated by the C code. Signed-off-by: Rahul Rameshbabu Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 2cc4f1e4ea96..426b22ed42b4 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -816,7 +816,7 @@ struct hid_usage_id { * zero from them. */ struct hid_driver { - char *name; + const char *name; const struct hid_device_id *id_table; struct list_head dyn_list; -- cgit v1.2.3 From 648dbccc03a000cd64c2a9d86012d98053545e64 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Tue, 16 Sep 2025 21:29:49 +0000 Subject: crypto: ccp - Add AMD Seamless Firmware Servicing (SFS) driver AMD Seamless Firmware Servicing (SFS) is a secure method to allow non-persistent updates to running firmware and settings without requiring BIOS reflash and/or system reset. SFS does not address anything that runs on the x86 processors and it can be used to update ASP firmware, modules, register settings and update firmware for other microprocessors like TMPM, etc. SFS driver support adds ioctl support to communicate the SFS commands to the ASP/PSP by using the TEE mailbox interface. The Seamless Firmware Servicing (SFS) driver is added as a PSP sub-device. 
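A hedged userspace sketch of the resulting interface (hypothetical program; /dev/sfs, SFSIOCFWVERS, and struct sfs_user_get_fw_versions are all defined by this patch):

| #include <fcntl.h>
| #include <stdio.h>
| #include <unistd.h>
| #include <sys/ioctl.h>
| #include <linux/psp-sfs.h>
|
| int main(void)
| {
| 	struct sfs_user_get_fw_versions vers = { 0 };
| 	int fd = open("/dev/sfs", O_RDWR);
|
| 	if (fd < 0 || ioctl(fd, SFSIOCFWVERS, &vers) < 0)
| 		return 1;
| 	printf("sfs_status=0x%x extended=0x%x\n",
| 	       vers.sfs_status, vers.sfs_extended_status);
| 	close(fd);
| 	return 0;
| }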
For detailed information, please look at the SFS specifications: https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/58604.pdf Signed-off-by: Ashish Kalra Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Acked-by: Herbert Xu Link: https://lore.kernel.org/cover.1758057691.git.ashish.kalra@amd.com --- drivers/crypto/ccp/Makefile | 3 +- drivers/crypto/ccp/psp-dev.c | 20 +++ drivers/crypto/ccp/psp-dev.h | 8 +- drivers/crypto/ccp/sfs.c | 311 ++++++++++++++++++++++++++++++++++++ drivers/crypto/ccp/sfs.h | 47 ++++++ include/linux/psp-platform-access.h | 2 + include/uapi/linux/psp-sfs.h | 87 ++++++++++ 7 files changed, 476 insertions(+), 2 deletions(-) create mode 100644 drivers/crypto/ccp/sfs.c create mode 100644 drivers/crypto/ccp/sfs.h create mode 100644 include/uapi/linux/psp-sfs.h (limited to 'include/linux') diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile index 394484929dae..a9626b30044a 100644 --- a/drivers/crypto/ccp/Makefile +++ b/drivers/crypto/ccp/Makefile @@ -13,7 +13,8 @@ ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o \ tee-dev.o \ platform-access.o \ dbc.o \ - hsti.o + hsti.o \ + sfs.o obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o ccp-crypto-objs := ccp-crypto-main.o \ diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 1c5a7189631e..9e21da0e298a 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -17,6 +17,7 @@ #include "psp-dev.h" #include "sev-dev.h" #include "tee-dev.h" +#include "sfs.h" #include "platform-access.h" #include "dbc.h" #include "hsti.h" @@ -182,6 +183,17 @@ static int psp_check_tee_support(struct psp_device *psp) return 0; } +static int psp_check_sfs_support(struct psp_device *psp) +{ + /* Check if device supports SFS feature */ + if (!psp->capability.sfs) { + dev_dbg(psp->dev, "psp does not support SFS\n"); + return -ENODEV; + } + + return 0; +} + static int psp_init(struct psp_device *psp) { int ret; @@ -198,6 +210,12 @@ static int psp_init(struct psp_device *psp) return ret; } + if (!psp_check_sfs_support(psp)) { + ret = sfs_dev_init(psp); + if (ret) + return ret; + } + if (psp->vdata->platform_access) { ret = platform_access_dev_init(psp); if (ret) @@ -302,6 +320,8 @@ void psp_dev_destroy(struct sp_device *sp) tee_dev_destroy(psp); + sfs_dev_destroy(psp); + dbc_dev_destroy(psp); platform_access_dev_destroy(psp); diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h index e43ce87ede76..268c83f298cb 100644 --- a/drivers/crypto/ccp/psp-dev.h +++ b/drivers/crypto/ccp/psp-dev.h @@ -32,7 +32,8 @@ union psp_cap_register { unsigned int sev :1, tee :1, dbc_thru_ext :1, - rsvd1 :4, + sfs :1, + rsvd1 :3, security_reporting :1, fused_part :1, rsvd2 :1, @@ -68,6 +69,7 @@ struct psp_device { void *tee_data; void *platform_access_data; void *dbc_data; + void *sfs_data; union psp_cap_register capability; }; @@ -118,12 +120,16 @@ struct psp_ext_request { * @PSP_SUB_CMD_DBC_SET_UID: Set UID for DBC * @PSP_SUB_CMD_DBC_GET_PARAMETER: Get parameter from DBC * @PSP_SUB_CMD_DBC_SET_PARAMETER: Set parameter for DBC + * @PSP_SUB_CMD_SFS_GET_FW_VERS: Get firmware versions for ASP and other MP + * @PSP_SUB_CMD_SFS_UPDATE: Command to load, verify and execute SFS package */ enum psp_sub_cmd { PSP_SUB_CMD_DBC_GET_NONCE = PSP_DYNAMIC_BOOST_GET_NONCE, PSP_SUB_CMD_DBC_SET_UID = PSP_DYNAMIC_BOOST_SET_UID, PSP_SUB_CMD_DBC_GET_PARAMETER = PSP_DYNAMIC_BOOST_GET_PARAMETER, PSP_SUB_CMD_DBC_SET_PARAMETER = PSP_DYNAMIC_BOOST_SET_PARAMETER, + 
PSP_SUB_CMD_SFS_GET_FW_VERS = PSP_SFS_GET_FW_VERSIONS, + PSP_SUB_CMD_SFS_UPDATE = PSP_SFS_UPDATE, }; int psp_extended_mailbox_cmd(struct psp_device *psp, unsigned int timeout_msecs, diff --git a/drivers/crypto/ccp/sfs.c b/drivers/crypto/ccp/sfs.c new file mode 100644 index 000000000000..2f4beaafe7ec --- /dev/null +++ b/drivers/crypto/ccp/sfs.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure Processor Seamless Firmware Servicing support. + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra + */ + +#include + +#include "sfs.h" +#include "sev-dev.h" + +#define SFS_DEFAULT_TIMEOUT (10 * MSEC_PER_SEC) +#define SFS_MAX_PAYLOAD_SIZE (2 * 1024 * 1024) +#define SFS_NUM_2MB_PAGES_CMDBUF (SFS_MAX_PAYLOAD_SIZE / PMD_SIZE) +#define SFS_NUM_PAGES_CMDBUF (SFS_MAX_PAYLOAD_SIZE / PAGE_SIZE) + +static DEFINE_MUTEX(sfs_ioctl_mutex); + +static struct sfs_misc_dev *misc_dev; + +static int send_sfs_cmd(struct sfs_device *sfs_dev, int msg) +{ + int ret; + + sfs_dev->command_buf->hdr.status = 0; + sfs_dev->command_buf->hdr.sub_cmd_id = msg; + + ret = psp_extended_mailbox_cmd(sfs_dev->psp, + SFS_DEFAULT_TIMEOUT, + (struct psp_ext_request *)sfs_dev->command_buf); + if (ret == -EIO) { + dev_dbg(sfs_dev->dev, + "msg 0x%x failed with PSP error: 0x%x, extended status: 0x%x\n", + msg, sfs_dev->command_buf->hdr.status, + *(u32 *)sfs_dev->command_buf->buf); + } + + return ret; +} + +static int send_sfs_get_fw_versions(struct sfs_device *sfs_dev) +{ + /* + * SFS_GET_FW_VERSIONS command needs the output buffer to be + * initialized to 0xC7 in every byte. + */ + memset(sfs_dev->command_buf->sfs_buffer, 0xc7, PAGE_SIZE); + sfs_dev->command_buf->hdr.payload_size = 2 * PAGE_SIZE; + + return send_sfs_cmd(sfs_dev, PSP_SFS_GET_FW_VERSIONS); +} + +static int send_sfs_update_package(struct sfs_device *sfs_dev, const char *payload_name) +{ + char payload_path[PAYLOAD_NAME_SIZE + sizeof("amd/")]; + const struct firmware *firmware; + unsigned long package_size; + int ret; + + /* Sanitize userspace provided payload name */ + if (!strnchr(payload_name, PAYLOAD_NAME_SIZE, '\0')) + return -EINVAL; + + snprintf(payload_path, sizeof(payload_path), "amd/%s", payload_name); + + ret = firmware_request_nowarn(&firmware, payload_path, sfs_dev->dev); + if (ret < 0) { + dev_warn_ratelimited(sfs_dev->dev, "firmware request failed for %s (%d)\n", + payload_path, ret); + return -ENOENT; + } + + /* + * SFS Update Package command's input buffer contains TEE_EXT_CMD_BUFFER + * followed by the Update Package and it should be 64KB aligned. + */ + package_size = ALIGN(firmware->size + PAGE_SIZE, 0x10000U); + + /* + * SFS command buffer is a pre-allocated 2MB buffer, fail update package + * if SFS payload is larger than the pre-allocated command buffer. + */ + if (package_size > SFS_MAX_PAYLOAD_SIZE) { + dev_warn_ratelimited(sfs_dev->dev, + "SFS payload size %ld larger than maximum supported payload size of %u\n", + package_size, SFS_MAX_PAYLOAD_SIZE); + release_firmware(firmware); + return -E2BIG; + } + + /* + * Copy firmware data to a HV_Fixed memory region. 
+ */ + memcpy(sfs_dev->command_buf->sfs_buffer, firmware->data, firmware->size); + sfs_dev->command_buf->hdr.payload_size = package_size; + + release_firmware(firmware); + + return send_sfs_cmd(sfs_dev, PSP_SFS_UPDATE); +} + +static long sfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct sfs_user_get_fw_versions __user *sfs_get_fw_versions; + struct sfs_user_update_package __user *sfs_update_package; + struct psp_device *psp_master = psp_get_master_device(); + char payload_name[PAYLOAD_NAME_SIZE]; + struct sfs_device *sfs_dev; + int ret = 0; + + if (!psp_master || !psp_master->sfs_data) + return -ENODEV; + + sfs_dev = psp_master->sfs_data; + + guard(mutex)(&sfs_ioctl_mutex); + + switch (cmd) { + case SFSIOCFWVERS: + dev_dbg(sfs_dev->dev, "in SFSIOCFWVERS\n"); + + sfs_get_fw_versions = (struct sfs_user_get_fw_versions __user *)arg; + + ret = send_sfs_get_fw_versions(sfs_dev); + if (ret && ret != -EIO) + return ret; + + /* + * Return SFS status and extended status back to userspace + * if PSP status indicated success or command error. + */ + if (copy_to_user(&sfs_get_fw_versions->blob, sfs_dev->command_buf->sfs_buffer, + PAGE_SIZE)) + return -EFAULT; + if (copy_to_user(&sfs_get_fw_versions->sfs_status, + &sfs_dev->command_buf->hdr.status, + sizeof(sfs_get_fw_versions->sfs_status))) + return -EFAULT; + if (copy_to_user(&sfs_get_fw_versions->sfs_extended_status, + &sfs_dev->command_buf->buf, + sizeof(sfs_get_fw_versions->sfs_extended_status))) + return -EFAULT; + break; + case SFSIOCUPDATEPKG: + dev_dbg(sfs_dev->dev, "in SFSIOCUPDATEPKG\n"); + + sfs_update_package = (struct sfs_user_update_package __user *)arg; + + if (copy_from_user(payload_name, sfs_update_package->payload_name, + PAYLOAD_NAME_SIZE)) + return -EFAULT; + + ret = send_sfs_update_package(sfs_dev, payload_name); + if (ret && ret != -EIO) + return ret; + + /* + * Return SFS status and extended status back to userspace + * if PSP status indicated success or command error. + */ + if (copy_to_user(&sfs_update_package->sfs_status, + &sfs_dev->command_buf->hdr.status, + sizeof(sfs_update_package->sfs_status))) + return -EFAULT; + if (copy_to_user(&sfs_update_package->sfs_extended_status, + &sfs_dev->command_buf->buf, + sizeof(sfs_update_package->sfs_extended_status))) + return -EFAULT; + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static const struct file_operations sfs_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = sfs_ioctl, +}; + +static void sfs_exit(struct kref *ref) +{ + misc_deregister(&misc_dev->misc); + kfree(misc_dev); + misc_dev = NULL; +} + +void sfs_dev_destroy(struct psp_device *psp) +{ + struct sfs_device *sfs_dev = psp->sfs_data; + + if (!sfs_dev) + return; + + /* + * Change SFS command buffer back to the default "Write-Back" type. + */ + set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + + snp_free_hv_fixed_pages(sfs_dev->page); + + if (sfs_dev->misc) + kref_put(&misc_dev->refcount, sfs_exit); + + psp->sfs_data = NULL; +} + +/* Based on sev_misc_init() */ +static int sfs_misc_init(struct sfs_device *sfs) +{ + struct device *dev = sfs->dev; + int ret; + + /* + * SFS feature support can be detected on multiple devices but the SFS + * FW commands must be issued on the master. During probe, we do not + * know the master hence we create /dev/sfs on the first device probe. 
+ */ + if (!misc_dev) { + struct miscdevice *misc; + + misc_dev = kzalloc(sizeof(*misc_dev), GFP_KERNEL); + if (!misc_dev) + return -ENOMEM; + + misc = &misc_dev->misc; + misc->minor = MISC_DYNAMIC_MINOR; + misc->name = "sfs"; + misc->fops = &sfs_fops; + misc->mode = 0600; + + ret = misc_register(misc); + if (ret) + return ret; + + kref_init(&misc_dev->refcount); + } else { + kref_get(&misc_dev->refcount); + } + + sfs->misc = misc_dev; + dev_dbg(dev, "registered SFS device\n"); + + return 0; +} + +int sfs_dev_init(struct psp_device *psp) +{ + struct device *dev = psp->dev; + struct sfs_device *sfs_dev; + struct page *page; + int ret = -ENOMEM; + + sfs_dev = devm_kzalloc(dev, sizeof(*sfs_dev), GFP_KERNEL); + if (!sfs_dev) + return -ENOMEM; + + /* + * Pre-allocate 2MB command buffer for all SFS commands using + * SNP HV_Fixed page allocator which also transitions the + * SFS command buffer to HV_Fixed page state if SNP is enabled. + */ + page = snp_alloc_hv_fixed_pages(SFS_NUM_2MB_PAGES_CMDBUF); + if (!page) { + dev_dbg(dev, "Command Buffer HV-Fixed page allocation failed\n"); + goto cleanup_dev; + } + sfs_dev->page = page; + sfs_dev->command_buf = page_address(page); + + dev_dbg(dev, "Command buffer 0x%px to be marked as HV_Fixed\n", sfs_dev->command_buf); + + /* + * SFS command buffer must be mapped as non-cacheable. + */ + ret = set_memory_uc((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + if (ret) { + dev_dbg(dev, "Set memory uc failed\n"); + goto cleanup_cmd_buf; + } + + dev_dbg(dev, "Command buffer 0x%px marked uncacheable\n", sfs_dev->command_buf); + + psp->sfs_data = sfs_dev; + sfs_dev->dev = dev; + sfs_dev->psp = psp; + + ret = sfs_misc_init(sfs_dev); + if (ret) + goto cleanup_mem_attr; + + dev_notice(sfs_dev->dev, "SFS support is available\n"); + + return 0; + +cleanup_mem_attr: + set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + +cleanup_cmd_buf: + snp_free_hv_fixed_pages(page); + +cleanup_dev: + psp->sfs_data = NULL; + devm_kfree(dev, sfs_dev); + + return ret; +} diff --git a/drivers/crypto/ccp/sfs.h b/drivers/crypto/ccp/sfs.h new file mode 100644 index 000000000000..97704c210efd --- /dev/null +++ b/drivers/crypto/ccp/sfs.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AMD Platform Security Processor (PSP) Seamless Firmware (SFS) Support. + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. 
+ * + * Author: Ashish Kalra + */ + +#ifndef __SFS_H__ +#define __SFS_H__ + +#include + +#include +#include +#include +#include +#include + +#include "psp-dev.h" + +struct sfs_misc_dev { + struct kref refcount; + struct miscdevice misc; +}; + +struct sfs_command { + struct psp_ext_req_buffer_hdr hdr; + u8 buf[PAGE_SIZE - sizeof(struct psp_ext_req_buffer_hdr)]; + u8 sfs_buffer[]; +} __packed; + +struct sfs_device { + struct device *dev; + struct psp_device *psp; + + struct page *page; + struct sfs_command *command_buf; + + struct sfs_misc_dev *misc; +}; + +void sfs_dev_destroy(struct psp_device *psp); +int sfs_dev_init(struct psp_device *psp); + +#endif /* __SFS_H__ */ diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h index 1504fb012c05..540abf7de048 100644 --- a/include/linux/psp-platform-access.h +++ b/include/linux/psp-platform-access.h @@ -7,6 +7,8 @@ enum psp_platform_access_msg { PSP_CMD_NONE = 0x0, + PSP_SFS_GET_FW_VERSIONS, + PSP_SFS_UPDATE, PSP_CMD_HSTI_QUERY = 0x14, PSP_I2C_REQ_BUS_CMD = 0x64, PSP_DYNAMIC_BOOST_GET_NONCE, diff --git a/include/uapi/linux/psp-sfs.h b/include/uapi/linux/psp-sfs.h new file mode 100644 index 000000000000..94e51670383c --- /dev/null +++ b/include/uapi/linux/psp-sfs.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* + * Userspace interface for AMD Seamless Firmware Servicing (SFS) + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra + */ + +#ifndef __PSP_SFS_USER_H__ +#define __PSP_SFS_USER_H__ + +#include + +/** + * SFS: AMD Seamless Firmware Support (SFS) interface + */ + +#define PAYLOAD_NAME_SIZE 64 +#define TEE_EXT_CMD_BUFFER_SIZE 4096 + +/** + * struct sfs_user_get_fw_versions - get current level of base firmware (output). + * @blob: current level of base firmware for ASP and patch levels (input/output). + * @sfs_status: 32-bit SFS status value (output). + * @sfs_extended_status: 32-bit SFS extended status value (output). + */ +struct sfs_user_get_fw_versions { + __u8 blob[TEE_EXT_CMD_BUFFER_SIZE]; + __u32 sfs_status; + __u32 sfs_extended_status; +} __packed; + +/** + * struct sfs_user_update_package - update SFS package (input). + * @payload_name: name of SFS package to load, verify and execute (input). + * @sfs_status: 32-bit SFS status value (output). + * @sfs_extended_status: 32-bit SFS extended status value (output). + */ +struct sfs_user_update_package { + char payload_name[PAYLOAD_NAME_SIZE]; + __u32 sfs_status; + __u32 sfs_extended_status; +} __packed; + +/** + * Seamless Firmware Support (SFS) IOC + * + * possible return codes for all SFS IOCTLs: + * 0: success + * -EINVAL: invalid input + * -E2BIG: excess data passed + * -EFAULT: failed to copy to/from userspace + * -EBUSY: mailbox in recovery or in use + * -ENODEV: driver not bound with PSP device + * -EACCES: request isn't authorized + * -EINVAL: invalid parameter + * -ETIMEDOUT: request timed out + * -EAGAIN: invalid request for state machine + * -ENOENT: not implemented + * -ENFILE: overflow + * -EPERM: invalid signature + * -EIO: PSP I/O error + */ +#define SFS_IOC_TYPE 'S' + +/** + * SFSIOCFWVERS - returns blob containing FW versions + * ASP provides the current level of Base Firmware for the ASP + * and the other microprocessors as well as current patch + * level(s). + */ +#define SFSIOCFWVERS _IOWR(SFS_IOC_TYPE, 0x1, struct sfs_user_get_fw_versions) + +/** + * SFSIOCUPDATEPKG - updates package/payload + * ASP loads, verifies and executes the SFS package. 
+ * By default, the SFS package/payload is loaded from + * /lib/firmware/amd, but alternative firmware loading + * path can be specified using kernel parameter + * firmware_class.path or the firmware loading path + * can be customized using sysfs file: + * /sys/module/firmware_class/parameters/path. + */ +#define SFSIOCUPDATEPKG _IOWR(SFS_IOC_TYPE, 0x2, struct sfs_user_update_package) + +#endif /* __PSP_SFS_USER_H__ */ -- cgit v1.2.3 From 45fe729be9a6be326a1ca25af82d34de32ba2ce8 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 16 Sep 2025 10:16:20 +0800 Subject: usb: typec: Stub out typec_switch APIs when CONFIG_TYPEC=n Ease driver development by adding stubs for the typec_switch APIs when CONFIG_TYPEC=n. Copy the same method used for the typec_mux APIs to be consistent. Acked-by: Greg Kroah-Hartman Reviewed-by: Heikki Krogerus Signed-off-by: Stephen Boyd Signed-off-by: Xu Yang Link: https://lore.kernel.org/r/20250916021620.1303995-1-xu.yang_2@nxp.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec_mux.h | 46 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/typec_mux.h b/include/linux/usb/typec_mux.h index 2489a7857d8e..aa9ebb7e2fe0 100644 --- a/include/linux/usb/typec_mux.h +++ b/include/linux/usb/typec_mux.h @@ -3,6 +3,7 @@ #ifndef __USB_TYPEC_MUX #define __USB_TYPEC_MUX +#include #include #include @@ -24,16 +25,13 @@ struct typec_switch_desc { void *drvdata; }; +#if IS_ENABLED(CONFIG_TYPEC) + struct typec_switch *fwnode_typec_switch_get(struct fwnode_handle *fwnode); void typec_switch_put(struct typec_switch *sw); int typec_switch_set(struct typec_switch *sw, enum typec_orientation orientation); -static inline struct typec_switch *typec_switch_get(struct device *dev) -{ - return fwnode_typec_switch_get(dev_fwnode(dev)); -} - struct typec_switch_dev * typec_switch_register(struct device *parent, const struct typec_switch_desc *desc); @@ -42,6 +40,44 @@ void typec_switch_unregister(struct typec_switch_dev *sw); void typec_switch_set_drvdata(struct typec_switch_dev *sw, void *data); void *typec_switch_get_drvdata(struct typec_switch_dev *sw); +#else + +static inline struct typec_switch * +fwnode_typec_switch_get(struct fwnode_handle *fwnode) +{ + return NULL; +} + +static inline void typec_switch_put(struct typec_switch *sw) {} + +static inline int typec_switch_set(struct typec_switch *sw, + enum typec_orientation orientation) +{ + return 0; +} + +static inline struct typec_switch_dev * +typec_switch_register(struct device *parent, + const struct typec_switch_desc *desc) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void typec_switch_unregister(struct typec_switch_dev *sw) {} + +static inline void typec_switch_set_drvdata(struct typec_switch_dev *sw, void *data) {} +static inline void *typec_switch_get_drvdata(struct typec_switch_dev *sw) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +#endif /* CONFIG_TYPEC */ + +static inline struct typec_switch *typec_switch_get(struct device *dev) +{ + return fwnode_typec_switch_get(dev_fwnode(dev)); +} + struct typec_mux_state { struct typec_altmode *alt; unsigned long mode; -- cgit v1.2.3 From bfb1d99d969fe3b892db30848aeebfa19d21f57f Mon Sep 17 00:00:00 2001 From: Kuen-Han Tsai Date: Tue, 16 Sep 2025 16:21:32 +0800 Subject: usb: gadget: Store endpoint pointer in usb_request Gadget function drivers often have goto-based error handling in their bind paths, which can be bug-prone. 
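A condensed sketch of that pattern (hypothetical labels and variables, for illustration only):

| req = usb_ep_alloc_request(ep, GFP_KERNEL);
| if (!req)
| 	goto fail;
| req->buf = kmalloc(len, GFP_KERNEL);
| if (!req->buf)
| 	goto free_req;	/* every later exit must unwind through here */
| ...
| free_req:
| 	usb_ep_free_request(ep, req);	/* needs both ep and req */
| fail:
| 	return -ENOMEM;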
Refactoring these paths to use __free() scope-based cleanup is desirable, but currently blocked. The blocker is that usb_ep_free_request(ep, req) requires two parameters, while the __free() mechanism can only pass a pointer to the request itself. Store an endpoint pointer in the struct usb_request. The pointer is populated centrally in usb_ep_alloc_request() on every successful allocation, making the request object self-contained. Signed-off-by: Kuen-Han Tsai Link: https://lore.kernel.org/r/20250916-ready-v1-1-4997bf277548@google.com Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250916-ready-v1-1-4997bf277548@google.com --- drivers/usb/gadget/udc/core.c | 3 +++ include/linux/usb/gadget.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index d709e24c1fd4..e3d63b8fa0f4 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -194,6 +194,9 @@ struct usb_request *usb_ep_alloc_request(struct usb_ep *ep, req = ep->ops->alloc_request(ep, gfp_flags); + if (req) + req->ep = ep; + trace_usb_ep_alloc_request(ep, req, req ? 0 : -ENOMEM); return req; diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 0f28c5512fcb..0f2079476088 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -32,6 +32,7 @@ struct usb_ep; /** * struct usb_request - describes one i/o request + * @ep: The associated endpoint set by usb_ep_alloc_request(). * @buf: Buffer used for data. Always provide this; some controllers * only use PIO, or don't use DMA for some endpoints. * @dma: DMA address corresponding to 'buf'. If you don't set this @@ -98,6 +99,7 @@ struct usb_ep; */ struct usb_request { + struct usb_ep *ep; void *buf; unsigned length; dma_addr_t dma; -- cgit v1.2.3 From 201c53c687f2b55a7cc6d9f4000af4797860174b Mon Sep 17 00:00:00 2001 From: Kuen-Han Tsai Date: Tue, 16 Sep 2025 16:21:33 +0800 Subject: usb: gadget: Introduce free_usb_request helper Introduce the free_usb_request() function that frees both the request's buffer and the request itself. This function serves as the cleanup callback for DEFINE_FREE() to enable automatic, scope-based cleanup for usb_request pointers. Signed-off-by: Kuen-Han Tsai Link: https://lore.kernel.org/r/20250916-ready-v1-2-4997bf277548@google.com Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250916-ready-v1-2-4997bf277548@google.com --- include/linux/usb/gadget.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 0f2079476088..3aaf19e77558 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -15,6 +15,7 @@ #ifndef __LINUX_USB_GADGET_H #define __LINUX_USB_GADGET_H +#include #include #include #include @@ -293,6 +294,28 @@ static inline void usb_ep_fifo_flush(struct usb_ep *ep) /*-------------------------------------------------------------------------*/ +/** + * free_usb_request - frees a usb_request object and its buffer + * @req: the request being freed + * + * This helper function frees both the request's buffer and the request object + * itself by calling usb_ep_free_request(). Its signature is designed to be used + * with DEFINE_FREE() to enable automatic, scope-based cleanup for usb_request + * pointers. 
+ */ +static inline void free_usb_request(struct usb_request *req) +{ + if (!req) + return; + + kfree(req->buf); + usb_ep_free_request(req->ep, req); +} + +DEFINE_FREE(free_usb_request, struct usb_request *, free_usb_request(_T)) + +/*-------------------------------------------------------------------------*/ + struct usb_dcd_config_params { __u8 bU1devExitLat; /* U1 Device exit Latency */ #define USB_DEFAULT_U1_DEV_EXIT_LAT 0x01 /* Less then 1 microsec */ -- cgit v1.2.3 From 413187f79062634575098653c40f95d439d3b157 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 17 Sep 2025 15:26:49 +0200 Subject: stddef: Remove token-pasting in TRAILING_OVERLAP() Currently, TRAILING_OVERLAP() token-pastes the FAM parameter into the name of the internal padding member `__offset_to_##FAM`. This forces FAM to be a single identifier, which prevents callers from using a FAM when it's a nested member. For instance, see the following scenario: | struct flex { | size_t count; | int data[]; | }; | struct foo { | int hdr_foo; | struct flex f; | }; | struct composite { | struct foo hdr; | int data[100]; | }; In this case, it'd be useful if TRAILING_OVERLAP() could be used in the following way: | struct composite { | TRAILING_OVERLAP(struct foo, hdr, f.data, | int data[100]; | ); | }; However, this is not currently possible due to the token concatenation in `__offset_to_##FAM`, which fails when FAM contains a dot. So, remove token-pasting and use the fixed internal name `__offset_to_FAM` and, with this, expand the capabilities of TRAILING_OVERLAP(). :) Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/13b3e0a69aad837b4e32ca8269b9d91bf1fbe9ef.1758115257.git.gustavoars@kernel.org Signed-off-by: Kees Cook --- include/linux/stddef.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stddef.h b/include/linux/stddef.h index dab49e2ec8c0..701099c67c24 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -108,7 +108,7 @@ enum { union { \ TYPE NAME; \ struct { \ - unsigned char __offset_to_##FAM[offsetof(TYPE, FAM)]; \ + unsigned char __offset_to_FAM[offsetof(TYPE, FAM)]; \ MEMBERS \ }; \ } -- cgit v1.2.3 From 2bbdcf02c3f3df6c85abd0a2fee0d7f0f4113e96 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 17 Sep 2025 15:28:07 +0200 Subject: stddef: Introduce __TRAILING_OVERLAP() Introduce the underlying __TRAILING_OVERLAP() macro to let callers apply attributes to trailing overlapping members. For instance, the code below: | struct flex { | size_t count; | int data[]; | }; | struct { | struct flex f; | struct foo a; | struct boo b; | } __packed instance; can now be changed to the following, and preserve the __packed attribute: | __TRAILING_OVERLAP(struct flex, f, data, __packed, | struct foo a; | struct boo b; | ) instance; Signed-off-by: Gustavo A. R. Silva Link: https://lore.kernel.org/r/f80c529b239ce11f0a51f714fe00ddf839e05f5e.1758115257.git.gustavoars@kernel.org Signed-off-by: Kees Cook --- include/linux/stddef.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stddef.h b/include/linux/stddef.h index 701099c67c24..80b6bfb944f0 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -94,7 +94,8 @@ enum { __DECLARE_FLEX_ARRAY(TYPE, NAME) /** - * TRAILING_OVERLAP() - Overlap a flexible-array member with trailing members. + * __TRAILING_OVERLAP() - Overlap a flexible-array member with trailing + * members.
* * Creates a union between a flexible-array member (FAM) in a struct and a set * of additional members that would otherwise follow it. @@ -102,15 +103,30 @@ enum { * @TYPE: Flexible structure type name, including "struct" keyword. * @NAME: Name for a variable to define. * @FAM: The flexible-array member within @TYPE + * @ATTRS: Any struct attributes (usually empty) + * @MEMBERS: Trailing overlapping members. */ -#define TRAILING_OVERLAP(TYPE, NAME, FAM, MEMBERS) \ +#define __TRAILING_OVERLAP(TYPE, NAME, FAM, ATTRS, MEMBERS) \ union { \ TYPE NAME; \ struct { \ unsigned char __offset_to_FAM[offsetof(TYPE, FAM)]; \ MEMBERS \ - }; \ + } ATTRS; \ } +/** + * TRAILING_OVERLAP() - Overlap a flexible-array member with trailing members. + * + * Creates a union between a flexible-array member (FAM) in a struct and a set + * of additional members that would otherwise follow it. + * + * @TYPE: Flexible structure type name, including "struct" keyword. + * @NAME: Name for a variable to define. + * @FAM: The flexible-array member within @TYPE + * @MEMBERS: Trailing overlapping members. + */ +#define TRAILING_OVERLAP(TYPE, NAME, FAM, MEMBERS) \ + __TRAILING_OVERLAP(TYPE, NAME, FAM, /* no attrs */, MEMBERS) + #endif -- cgit v1.2.3 From 5c5db9efe323dd0b0d7917dbe5b9c0999c95e79e Mon Sep 17 00:00:00 2001 From: Sascha Bischoff Date: Thu, 28 Aug 2025 10:59:43 +0000 Subject: irqchip/gic-v5: Drop has_gcie_v3_compat from gic_kvm_info The presence of FEAT_GCIE_LEGACY is now handled as a CPU feature. Therefore, drop the check and flag from the GIC driver and gic_kvm_info, as they are no longer required or used by KVM. Signed-off-by: Sascha Bischoff Acked-by: Thomas Gleixner Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v5.c | 7 ------- include/linux/irqchip/arm-vgic-info.h | 2 -- 2 files changed, 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v5.c b/drivers/irqchip/irq-gic-v5.c index 4bd224f359a7..41ef286c4d78 100644 --- a/drivers/irqchip/irq-gic-v5.c +++ b/drivers/irqchip/irq-gic-v5.c @@ -1062,16 +1062,9 @@ static void gicv5_set_cpuif_idbits(void) #ifdef CONFIG_KVM static struct gic_kvm_info gic_v5_kvm_info __initdata; -static bool __init gicv5_cpuif_has_gcie_legacy(void) -{ - u64 idr0 = read_sysreg_s(SYS_ICC_IDR0_EL1); - return !!FIELD_GET(ICC_IDR0_EL1_GCIE_LEGACY, idr0); -} - static void __init gic_of_setup_kvm_info(struct device_node *node) { gic_v5_kvm_info.type = GIC_V5; - gic_v5_kvm_info.has_gcie_v3_compat = gicv5_cpuif_has_gcie_legacy(); /* GIC Virtual CPU interface maintenance interrupt */ gic_v5_kvm_info.no_maint_irq_mask = false; diff --git a/include/linux/irqchip/arm-vgic-info.h b/include/linux/irqchip/arm-vgic-info.h index ca1713fac6e3..a470a73a805a 100644 --- a/include/linux/irqchip/arm-vgic-info.h +++ b/include/linux/irqchip/arm-vgic-info.h @@ -36,8 +36,6 @@ struct gic_kvm_info { bool has_v4_1; /* Deactivation impared, subpar stuff */ bool no_hw_deactivation; - /* v3 compat support (GICv5 hosts, only) */ - bool has_gcie_v3_compat; }; #ifdef CONFIG_KVM -- cgit v1.2.3 From 652b08afba69d5d26fe91098eb832b1bcc0f91c2 Mon Sep 17 00:00:00 2001 From: Cristian Birsan Date: Thu, 21 Nov 2024 20:16:38 +0200 Subject: ARM: at91: remove default values for PMC_PLL_ACR Remove the default values for the PMC PLL Analog Control Register (ACR), as the values are specific to each SoC and PLL, and load them from the PLL characteristics structure instead. Co-developed-by: Andrei Simion Signed-off-by: Andrei Simion Signed-off-by: Cristian Birsan [nicolas.ferre@microchip.com: fix pll acr
write sequence, preserve val] Signed-off-by: Nicolas Ferre --- drivers/clk/at91/clk-sam9x60-pll.c | 7 ++----- include/linux/clk/at91_pmc.h | 2 -- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/at91/clk-sam9x60-pll.c b/drivers/clk/at91/clk-sam9x60-pll.c index a035dc15454b..3dc75a394ce1 100644 --- a/drivers/clk/at91/clk-sam9x60-pll.c +++ b/drivers/clk/at91/clk-sam9x60-pll.c @@ -103,11 +103,8 @@ static int sam9x60_frac_pll_set(struct sam9x60_pll_core *core) (cmul == frac->mul && cfrac == frac->frac)) goto unlock; - /* Recommended value for PMC_PLL_ACR */ - if (core->characteristics->upll) - val = AT91_PMC_PLL_ACR_DEFAULT_UPLL; - else - val = AT91_PMC_PLL_ACR_DEFAULT_PLLA; + /* Load recommended value for PMC_PLL_ACR */ + val = core->characteristics->acr; regmap_write(regmap, AT91_PMC_PLL_ACR, val); regmap_write(regmap, AT91_PMC_PLL_CTRL1, diff --git a/include/linux/clk/at91_pmc.h b/include/linux/clk/at91_pmc.h index 7af499bdbecb..d60ce9708ea2 100644 --- a/include/linux/clk/at91_pmc.h +++ b/include/linux/clk/at91_pmc.h @@ -47,8 +47,6 @@ #define AT91_PMC_PCSR 0x18 /* Peripheral Clock Status Register */ #define AT91_PMC_PLL_ACR 0x18 /* PLL Analog Control Register [for SAM9X60] */ -#define AT91_PMC_PLL_ACR_DEFAULT_UPLL UL(0x12020010) /* Default PLL ACR value for UPLL */ -#define AT91_PMC_PLL_ACR_DEFAULT_PLLA UL(0x00020010) /* Default PLL ACR value for PLLA */ #define AT91_PMC_PLL_ACR_UTMIVR (1 << 12) /* UPLL Voltage regulator Control */ #define AT91_PMC_PLL_ACR_UTMIBG (1 << 13) /* UPLL Bandgap Control */ -- cgit v1.2.3 From d4bf06592ad68ac4353a81c73e8e662cf88aa2cc Mon Sep 17 00:00:00 2001 From: Viken Dadhaniya Date: Thu, 11 Sep 2025 10:02:53 +0530 Subject: soc: qcom: geni-se: Add support to load QUP SE Firmware via Linux subsystem In Qualcomm SoCs, firmware loading for Serial Engines (SE) within the QUP hardware has traditionally been managed by TrustZone (TZ). This restriction poses a significant challenge for developers, as it limits their ability to enable various protocols on any of the SEs from the Linux side, reducing flexibility. Load the firmware to QUP SE based on the 'firmware-name' property specified in devicetree at bootup time. Co-developed-by: Mukesh Kumar Savaliya Signed-off-by: Mukesh Kumar Savaliya Signed-off-by: Viken Dadhaniya Link: https://lore.kernel.org/r/20250911043256.3523057-4-viken.dadhaniya@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/qcom-geni-se.c | 474 ++++++++++++++++++++++++++++++++++++++- include/linux/soc/qcom/geni-se.h | 4 + 2 files changed, 475 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/soc/qcom/qcom-geni-se.c b/drivers/soc/qcom/qcom-geni-se.c index e8ab2833815e..cd1779b6a91a 100644 --- a/drivers/soc/qcom/qcom-geni-se.c +++ b/drivers/soc/qcom/qcom-geni-se.c @@ -8,7 +8,9 @@ #define __DISABLE_TRACE_MMIO__ #include +#include #include +#include #include #include #include @@ -113,8 +115,80 @@ struct geni_se_desc { static const char * const icc_path_names[] = {"qup-core", "qup-config", "qup-memory"}; +static const char * const protocol_name[] = { "None", "SPI", "UART", "I2C", "I3C", "SPI SLAVE" }; + +/** + * struct se_fw_hdr - Serial Engine firmware configuration header + * + * This structure defines the SE firmware header, which together with the + * firmware payload is stored in individual ELF segments. + * + * @magic: Set to 'SEFW'. + * @version: Structure version number. + * @core_version: QUPV3 hardware version. 
+ * @serial_protocol: Encoded in GENI_FW_REVISION. + * @fw_version: Firmware version, from GENI_FW_REVISION. + * @cfg_version: Configuration version, from GENI_INIT_CFG_REVISION. + * @fw_size_in_items: Number of 32-bit words in GENI_FW_RAM. + * @fw_offset: Byte offset to GENI_FW_RAM array. + * @cfg_size_in_items: Number of GENI_FW_CFG index/value pairs. + * @cfg_idx_offset: Byte offset to GENI_FW_CFG index array. + * @cfg_val_offset: Byte offset to GENI_FW_CFG values array. + */ +struct se_fw_hdr { + __le32 magic; + __le32 version; + __le32 core_version; + __le16 serial_protocol; + __le16 fw_version; + __le16 cfg_version; + __le16 fw_size_in_items; + __le16 fw_offset; + __le16 cfg_size_in_items; + __le16 cfg_idx_offset; + __le16 cfg_val_offset; +}; + +/*Magic numbers*/ +#define SE_MAGIC_NUM 0x57464553 + +#define MAX_GENI_CFG_RAMn_CNT 455 + +#define MI_PBT_NON_PAGED_SEGMENT 0x0 +#define MI_PBT_HASH_SEGMENT 0x2 +#define MI_PBT_NOTUSED_SEGMENT 0x3 +#define MI_PBT_SHARED_SEGMENT 0x4 + +#define MI_PBT_FLAG_PAGE_MODE BIT(20) +#define MI_PBT_FLAG_SEGMENT_TYPE GENMASK(26, 24) +#define MI_PBT_FLAG_ACCESS_TYPE GENMASK(23, 21) + +#define MI_PBT_PAGE_MODE_VALUE(x) FIELD_GET(MI_PBT_FLAG_PAGE_MODE, x) + +#define MI_PBT_SEGMENT_TYPE_VALUE(x) FIELD_GET(MI_PBT_FLAG_SEGMENT_TYPE, x) + +#define MI_PBT_ACCESS_TYPE_VALUE(x) FIELD_GET(MI_PBT_FLAG_ACCESS_TYPE, x) + +#define M_COMMON_GENI_M_IRQ_EN (GENMASK(6, 1) | \ + M_IO_DATA_DEASSERT_EN | \ + M_IO_DATA_ASSERT_EN | M_RX_FIFO_RD_ERR_EN | \ + M_RX_FIFO_WR_ERR_EN | M_TX_FIFO_RD_ERR_EN | \ + M_TX_FIFO_WR_ERR_EN) + /* Common QUPV3 registers */ #define QUPV3_HW_VER_REG 0x4 +#define QUPV3_SE_AHB_M_CFG 0x118 +#define QUPV3_COMMON_CFG 0x120 +#define QUPV3_COMMON_CGC_CTRL 0x21c + +/* QUPV3_COMMON_CFG fields */ +#define FAST_SWITCH_TO_HIGH_DISABLE BIT(0) + +/* QUPV3_SE_AHB_M_CFG fields */ +#define AHB_M_CLK_CGC_ON BIT(0) + +/* QUPV3_COMMON_CGC_CTRL fields */ +#define COMMON_CSR_SLV_CLK_CGC_ON BIT(0) /* Common SE registers */ #define SE_GENI_INIT_CFG_REVISION 0x0 @@ -122,11 +196,13 @@ static const char * const icc_path_names[] = {"qup-core", "qup-config", #define SE_GENI_CGC_CTRL 0x28 #define SE_GENI_CLK_CTRL_RO 0x60 #define SE_GENI_FW_S_REVISION_RO 0x6c +#define SE_GENI_CFG_REG0 0x100 #define SE_GENI_BYTE_GRAN 0x254 #define SE_GENI_TX_PACKING_CFG0 0x260 #define SE_GENI_TX_PACKING_CFG1 0x264 #define SE_GENI_RX_PACKING_CFG0 0x284 #define SE_GENI_RX_PACKING_CFG1 0x288 +#define SE_GENI_S_IRQ_ENABLE 0x644 #define SE_DMA_TX_PTR_L 0xc30 #define SE_DMA_TX_PTR_H 0xc34 #define SE_DMA_TX_ATTR 0xc38 @@ -148,6 +224,15 @@ static const char * const icc_path_names[] = {"qup-core", "qup-config", #define SE_GSI_EVENT_EN 0xe18 #define SE_IRQ_EN 0xe1c #define SE_DMA_GENERAL_CFG 0xe30 +#define SE_GENI_FW_REVISION 0x1000 +#define SE_GENI_S_FW_REVISION 0x1004 +#define SE_GENI_CFG_RAMN 0x1010 +#define SE_GENI_CLK_CTRL 0x2000 +#define SE_DMA_IF_EN 0x2004 +#define SE_FIFO_IF_DISABLE 0x2008 + +/* GENI_FW_REVISION_RO fields */ +#define FW_REV_VERSION_MSK GENMASK(7, 0) /* GENI_OUTPUT_CTRL fields */ #define DEFAULT_IO_OUTPUT_CTRL_MSK GENMASK(6, 0) @@ -186,6 +271,15 @@ static const char * const icc_path_names[] = {"qup-core", "qup-config", #define RX_DMA_IRQ_DELAY_MSK GENMASK(8, 6) #define RX_DMA_IRQ_DELAY_SHFT 6 +/* GENI_CLK_CTRL fields */ +#define SER_CLK_SEL BIT(0) + +/* GENI_DMA_IF_EN fields */ +#define DMA_IF_EN BIT(0) + +#define geni_setbits32(_addr, _v) writel(readl(_addr) | (_v), _addr) +#define geni_clrbits32(_addr, _v) writel(readl(_addr) & ~(_v), _addr) + /** * geni_se_get_qup_hw_version() - 
Read the QUP wrapper Hardware version * @se: Pointer to the corresponding serial engine. @@ -658,9 +752,12 @@ int geni_se_clk_freq_match(struct geni_se *se, unsigned long req_freq, } EXPORT_SYMBOL_GPL(geni_se_clk_freq_match); -#define GENI_SE_DMA_DONE_EN BIT(0) -#define GENI_SE_DMA_EOT_EN BIT(1) -#define GENI_SE_DMA_AHB_ERR_EN BIT(2) +#define GENI_SE_DMA_DONE_EN BIT(0) +#define GENI_SE_DMA_EOT_EN BIT(1) +#define GENI_SE_DMA_AHB_ERR_EN BIT(2) +#define GENI_SE_DMA_RESET_DONE_EN BIT(3) +#define GENI_SE_DMA_FLUSH_DONE BIT(4) + #define GENI_SE_DMA_EOT_BUF BIT(0) /** @@ -891,6 +988,377 @@ int geni_icc_disable(struct geni_se *se) } EXPORT_SYMBOL_GPL(geni_icc_disable); +/** + * geni_find_protocol_fw() - Locate and validate SE firmware for a protocol. + * @dev: Pointer to the device structure. + * @fw: Pointer to the firmware image. + * @protocol: Expected serial engine protocol type. + * + * Identifies the appropriate firmware image or configuration required for a + * specific communication protocol instance running on a Qualcomm GENI + * controller. + * + * Return: pointer to a valid 'struct se_fw_hdr' if found, or NULL otherwise. + */ +static struct se_fw_hdr *geni_find_protocol_fw(struct device *dev, const struct firmware *fw, + enum geni_se_protocol_type protocol) +{ + const struct elf32_hdr *ehdr; + const struct elf32_phdr *phdrs; + const struct elf32_phdr *phdr; + struct se_fw_hdr *sefw; + u32 fw_end, cfg_idx_end, cfg_val_end; + u16 fw_size; + int i; + + if (!fw || fw->size < sizeof(struct elf32_hdr)) + return NULL; + + ehdr = (const struct elf32_hdr *)fw->data; + phdrs = (const struct elf32_phdr *)(fw->data + ehdr->e_phoff); + + /* + * The firmware is expected to have at least two program headers (segments). + * One for metadata and the other for the actual protocol-specific firmware. 
+ */ + if (ehdr->e_phnum < 2) { + dev_err(dev, "Invalid firmware: less than 2 program headers\n"); + return NULL; + } + + for (i = 0; i < ehdr->e_phnum; i++) { + phdr = &phdrs[i]; + + if (fw->size < phdr->p_offset + phdr->p_filesz) { + dev_err(dev, "Firmware size (%zu) < expected offset (%u) + size (%u)\n", + fw->size, phdr->p_offset, phdr->p_filesz); + return NULL; + } + + if (phdr->p_type != PT_LOAD || !phdr->p_memsz) + continue; + + if (MI_PBT_PAGE_MODE_VALUE(phdr->p_flags) != MI_PBT_NON_PAGED_SEGMENT || + MI_PBT_SEGMENT_TYPE_VALUE(phdr->p_flags) == MI_PBT_HASH_SEGMENT || + MI_PBT_ACCESS_TYPE_VALUE(phdr->p_flags) == MI_PBT_NOTUSED_SEGMENT || + MI_PBT_ACCESS_TYPE_VALUE(phdr->p_flags) == MI_PBT_SHARED_SEGMENT) + continue; + + if (phdr->p_filesz < sizeof(struct se_fw_hdr)) + continue; + + sefw = (struct se_fw_hdr *)(fw->data + phdr->p_offset); + fw_size = le16_to_cpu(sefw->fw_size_in_items); + fw_end = le16_to_cpu(sefw->fw_offset) + fw_size * sizeof(u32); + cfg_idx_end = le16_to_cpu(sefw->cfg_idx_offset) + + le16_to_cpu(sefw->cfg_size_in_items) * sizeof(u8); + cfg_val_end = le16_to_cpu(sefw->cfg_val_offset) + + le16_to_cpu(sefw->cfg_size_in_items) * sizeof(u32); + + if (le32_to_cpu(sefw->magic) != SE_MAGIC_NUM || le32_to_cpu(sefw->version) != 1) + continue; + + if (le32_to_cpu(sefw->serial_protocol) != protocol) + continue; + + if (fw_size % 2 != 0) { + fw_size++; + sefw->fw_size_in_items = cpu_to_le16(fw_size); + } + + if (fw_size >= MAX_GENI_CFG_RAMn_CNT) { + dev_err(dev, + "Firmware size (%u) exceeds max allowed RAMn count (%u)\n", + fw_size, MAX_GENI_CFG_RAMn_CNT); + continue; + } + + if (fw_end > phdr->p_filesz || cfg_idx_end > phdr->p_filesz || + cfg_val_end > phdr->p_filesz) { + dev_err(dev, "Truncated or corrupt SE FW segment found at index %d\n", i); + continue; + } + + return sefw; + } + + dev_err(dev, "Failed to get %s protocol firmware\n", protocol_name[protocol]); + return NULL; +} + +/** + * geni_configure_xfer_mode() - Set the transfer mode. + * @se: Pointer to the concerned serial engine. + * @mode: SE data transfer mode. + * + * Set the transfer mode to either FIFO or DMA according to the mode specified + * by the protocol driver. + * + * Return: 0 if successful, otherwise return an error value. + */ +static int geni_configure_xfer_mode(struct geni_se *se, enum geni_se_xfer_mode mode) +{ + /* Configure SE FIFO, DMA or GSI mode. */ + switch (mode) { + case GENI_GPI_DMA: + geni_setbits32(se->base + SE_GENI_DMA_MODE_EN, GENI_DMA_MODE_EN); + writel(0x0, se->base + SE_IRQ_EN); + writel(DMA_RX_EVENT_EN | DMA_TX_EVENT_EN | GENI_M_EVENT_EN | GENI_S_EVENT_EN, + se->base + SE_GSI_EVENT_EN); + break; + + case GENI_SE_FIFO: + geni_clrbits32(se->base + SE_GENI_DMA_MODE_EN, GENI_DMA_MODE_EN); + writel(DMA_RX_IRQ_EN | DMA_TX_IRQ_EN | GENI_M_IRQ_EN | GENI_S_IRQ_EN, + se->base + SE_IRQ_EN); + writel(0x0, se->base + SE_GSI_EVENT_EN); + break; + + case GENI_SE_DMA: + geni_setbits32(se->base + SE_GENI_DMA_MODE_EN, GENI_DMA_MODE_EN); + writel(DMA_RX_IRQ_EN | DMA_TX_IRQ_EN | GENI_M_IRQ_EN | GENI_S_IRQ_EN, + se->base + SE_IRQ_EN); + writel(0x0, se->base + SE_GSI_EVENT_EN); + break; + + default: + dev_err(se->dev, "Invalid geni-se transfer mode: %d\n", mode); + return -EINVAL; + } + return 0; +} + +/** + * geni_enable_interrupts() - Enable interrupts. + * @se: Pointer to the concerned serial engine. + * + * Enable the required interrupts during the firmware load process. + */ +static void geni_enable_interrupts(struct geni_se *se) +{ + u32 val; + + /* Enable required interrupts. 
*/ + writel(M_COMMON_GENI_M_IRQ_EN, se->base + SE_GENI_M_IRQ_EN); + + val = S_CMD_OVERRUN_EN | S_ILLEGAL_CMD_EN | S_CMD_CANCEL_EN | S_CMD_ABORT_EN | + S_GP_IRQ_0_EN | S_GP_IRQ_1_EN | S_GP_IRQ_2_EN | S_GP_IRQ_3_EN | + S_RX_FIFO_WR_ERR_EN | S_RX_FIFO_RD_ERR_EN; + writel(val, se->base + SE_GENI_S_IRQ_ENABLE); + + /* DMA mode configuration. */ + val = GENI_SE_DMA_RESET_DONE_EN | GENI_SE_DMA_AHB_ERR_EN | GENI_SE_DMA_DONE_EN; + writel(val, se->base + SE_DMA_TX_IRQ_EN_SET); + val = GENI_SE_DMA_FLUSH_DONE | GENI_SE_DMA_RESET_DONE_EN | GENI_SE_DMA_AHB_ERR_EN | + GENI_SE_DMA_DONE_EN; + writel(val, se->base + SE_DMA_RX_IRQ_EN_SET); +} + +/** + * geni_write_fw_revision() - Write the firmware revision. + * @se: Pointer to the concerned serial engine. + * @serial_protocol: serial protocol type. + * @fw_version: QUP firmware version. + * + * Write the firmware revision and protocol into the respective register. + */ +static void geni_write_fw_revision(struct geni_se *se, u16 serial_protocol, u16 fw_version) +{ + u32 reg; + + reg = FIELD_PREP(FW_REV_PROTOCOL_MSK, serial_protocol); + reg |= FIELD_PREP(FW_REV_VERSION_MSK, fw_version); + + writel(reg, se->base + SE_GENI_FW_REVISION); + writel(reg, se->base + SE_GENI_S_FW_REVISION); +} + +/** + * geni_load_se_fw() - Load Serial Engine specific firmware. + * @se: Pointer to the concerned serial engine. + * @fw: Pointer to the firmware structure. + * @mode: SE data transfer mode. + * @protocol: Protocol type to be used with the SE (e.g., UART, SPI, I2C). + * + * Load the protocol firmware into the IRAM of the Serial Engine. + * + * Return: 0 if successful, otherwise return an error value. + */ +static int geni_load_se_fw(struct geni_se *se, const struct firmware *fw, + enum geni_se_xfer_mode mode, enum geni_se_protocol_type protocol) +{ + const u32 *fw_data, *cfg_val_arr; + const u8 *cfg_idx_arr; + u32 i, reg_value; + int ret; + struct se_fw_hdr *hdr; + + hdr = geni_find_protocol_fw(se->dev, fw, protocol); + if (!hdr) + return -EINVAL; + + fw_data = (const u32 *)((u8 *)hdr + le16_to_cpu(hdr->fw_offset)); + cfg_idx_arr = (const u8 *)hdr + le16_to_cpu(hdr->cfg_idx_offset); + cfg_val_arr = (const u32 *)((u8 *)hdr + le16_to_cpu(hdr->cfg_val_offset)); + + ret = geni_icc_set_bw(se); + if (ret) + return ret; + + ret = geni_icc_enable(se); + if (ret) + return ret; + + ret = geni_se_resources_on(se); + if (ret) + goto out_icc_disable; + + /* + * Disable high-priority interrupts until all currently executing + * low-priority interrupts have been fully handled. + */ + geni_setbits32(se->wrapper->base + QUPV3_COMMON_CFG, FAST_SWITCH_TO_HIGH_DISABLE); + + /* Set AHB_M_CLK_CGC_ON to indicate hardware controls se-wrapper cgc clock. */ + geni_setbits32(se->wrapper->base + QUPV3_SE_AHB_M_CFG, AHB_M_CLK_CGC_ON); + + /* Let hardware to control common cgc. */ + geni_setbits32(se->wrapper->base + QUPV3_COMMON_CGC_CTRL, COMMON_CSR_SLV_CLK_CGC_ON); + + /* + * Setting individual bits in GENI_OUTPUT_CTRL activates corresponding output lines, + * allowing the hardware to drive data as configured. + */ + writel(0x0, se->base + GENI_OUTPUT_CTRL); + + /* Set SCLK and HCLK to program RAM */ + geni_setbits32(se->base + SE_GENI_CGC_CTRL, PROG_RAM_SCLK_OFF | PROG_RAM_HCLK_OFF); + writel(0x0, se->base + SE_GENI_CLK_CTRL); + geni_clrbits32(se->base + SE_GENI_CGC_CTRL, PROG_RAM_SCLK_OFF | PROG_RAM_HCLK_OFF); + + /* Enable required clocks for DMA CSR, TX and RX. 
*/ + reg_value = AHB_SEC_SLV_CLK_CGC_ON | DMA_AHB_SLV_CLK_CGC_ON | + DMA_TX_CLK_CGC_ON | DMA_RX_CLK_CGC_ON; + geni_setbits32(se->base + SE_DMA_GENERAL_CFG, reg_value); + + /* Let hardware control CGC by default. */ + writel(DEFAULT_CGC_EN, se->base + SE_GENI_CGC_CTRL); + + /* Set version of the configuration register part of firmware. */ + writel(le16_to_cpu(hdr->cfg_version), se->base + SE_GENI_INIT_CFG_REVISION); + writel(le16_to_cpu(hdr->cfg_version), se->base + SE_GENI_S_INIT_CFG_REVISION); + + /* Configure GENI primitive table. */ + for (i = 0; i < le16_to_cpu(hdr->cfg_size_in_items); i++) + writel(cfg_val_arr[i], + se->base + SE_GENI_CFG_REG0 + (cfg_idx_arr[i] * sizeof(u32))); + + /* Configure condition for assertion of RX_RFR_WATERMARK condition. */ + reg_value = geni_se_get_rx_fifo_depth(se); + writel(reg_value - 2, se->base + SE_GENI_RX_RFR_WATERMARK_REG); + + /* Let hardware control CGC */ + geni_setbits32(se->base + GENI_OUTPUT_CTRL, DEFAULT_IO_OUTPUT_CTRL_MSK); + + ret = geni_configure_xfer_mode(se, mode); + if (ret) + goto out_resources_off; + + geni_enable_interrupts(se); + + geni_write_fw_revision(se, le16_to_cpu(hdr->serial_protocol), le16_to_cpu(hdr->fw_version)); + + /* Program RAM address space. */ + memcpy_toio(se->base + SE_GENI_CFG_RAMN, fw_data, + le16_to_cpu(hdr->fw_size_in_items) * sizeof(u32)); + + /* Put default values on GENI's output pads. */ + writel_relaxed(0x1, se->base + GENI_FORCE_DEFAULT_REG); + + /* Toggle SCLK/HCLK from high to low to finalize RAM programming and apply config. */ + geni_setbits32(se->base + SE_GENI_CGC_CTRL, PROG_RAM_SCLK_OFF | PROG_RAM_HCLK_OFF); + geni_setbits32(se->base + SE_GENI_CLK_CTRL, SER_CLK_SEL); + geni_clrbits32(se->base + SE_GENI_CGC_CTRL, PROG_RAM_SCLK_OFF | PROG_RAM_HCLK_OFF); + + /* Serial engine DMA interface is enabled. */ + geni_setbits32(se->base + SE_DMA_IF_EN, DMA_IF_EN); + + /* Enable or disable FIFO interface of the serial engine. */ + if (mode == GENI_SE_FIFO) + geni_clrbits32(se->base + SE_FIFO_IF_DISABLE, FIFO_IF_DISABLE); + else + geni_setbits32(se->base + SE_FIFO_IF_DISABLE, FIFO_IF_DISABLE); + +out_resources_off: + geni_se_resources_off(se); + +out_icc_disable: + geni_icc_disable(se); + return ret; +} + +/** + * geni_load_se_firmware() - Load firmware for SE based on protocol + * @se: Pointer to the concerned serial engine. + * @protocol: Protocol type to be used with the SE (e.g., UART, SPI, I2C). + * + * Retrieves the firmware name from device properties and sets the transfer mode + * (FIFO or GSI DMA) based on device tree configuration. Enforces FIFO mode for + * UART protocol due to lack of GSI DMA support. Requests the firmware and loads + * it into the SE. + * + * Return: 0 on success, negative error code on failure. 
+ */ +int geni_load_se_firmware(struct geni_se *se, enum geni_se_protocol_type protocol) +{ + const char *fw_name; + const struct firmware *fw; + enum geni_se_xfer_mode mode = GENI_SE_FIFO; + int ret; + + if (protocol >= ARRAY_SIZE(protocol_name)) { + dev_err(se->dev, "Invalid geni-se protocol: %d", protocol); + return -EINVAL; + } + + ret = device_property_read_string(se->wrapper->dev, "firmware-name", &fw_name); + if (ret) { + dev_err(se->dev, "Failed to read firmware-name property: %d\n", ret); + return -EINVAL; + } + + if (of_property_read_bool(se->dev->of_node, "qcom,enable-gsi-dma")) + mode = GENI_GPI_DMA; + + /* GSI mode is not supported by the UART driver; therefore, setting FIFO mode */ + if (protocol == GENI_SE_UART) + mode = GENI_SE_FIFO; + + ret = request_firmware(&fw, fw_name, se->dev); + if (ret) { + if (ret == -ENOENT) + return -EPROBE_DEFER; + + dev_err(se->dev, "Failed to request firmware '%s' for protocol %d: ret: %d\n", + fw_name, protocol, ret); + return ret; + } + + ret = geni_load_se_fw(se, fw, mode, protocol); + release_firmware(fw); + + if (ret) { + dev_err(se->dev, "Failed to load SE firmware for protocol %d: ret: %d\n", + protocol, ret); + return ret; + } + + dev_dbg(se->dev, "Firmware load for %s protocol is successful for xfer mode: %d\n", + protocol_name[protocol], mode); + return 0; +} +EXPORT_SYMBOL_GPL(geni_load_se_firmware); + static int geni_se_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; diff --git a/include/linux/soc/qcom/geni-se.h b/include/linux/soc/qcom/geni-se.h index 2996a3c28ef3..0a984e2579fe 100644 --- a/include/linux/soc/qcom/geni-se.h +++ b/include/linux/soc/qcom/geni-se.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2017-2018, The Linux Foundation. All rights reserved. + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. */ #ifndef _LINUX_QCOM_GENI_SE @@ -36,6 +37,7 @@ enum geni_se_protocol_type { GENI_SE_I2C, GENI_SE_I3C, GENI_SE_SPI_SLAVE, + GENI_SE_INVALID_PROTO = 255, }; struct geni_wrapper; @@ -531,5 +533,7 @@ void geni_icc_set_tag(struct geni_se *se, u32 tag); int geni_icc_enable(struct geni_se *se); int geni_icc_disable(struct geni_se *se); + +int geni_load_se_firmware(struct geni_se *se, enum geni_se_protocol_type protocol); #endif #endif -- cgit v1.2.3 From 09a1b33c080f6ac700fadc67c8471e67bf75fda4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Aug 2025 12:33:11 -0400 Subject: preparations to taking MNT_WRITE_HOLD out of ->mnt_flags We have an unpleasant wart in accessibility rules for struct mount. There are per-superblock lists of mounts, used by sb_prepare_remount_readonly() to check if any of those is currently claimed for write access and to block further attempts to get write access on those until we are done. As soon as it is attached to a filesystem, mount becomes reachable via that list. Only sb_prepare_remount_readonly() traverses it and it only accesses a few members of struct mount. Unfortunately, ->mnt_flags is one of those and it is modified - MNT_WRITE_HOLD set and then cleared. It is done under mount_lock, so from the locking rules POV everything's fine. However, it has easily overlooked implications - once mount has been attached to a filesystem, it has to be treated as globally visible. In particular, initializing ->mnt_flags *must* be done either prior to that point or under mount_lock. All other members are still private at that point. 
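To illustrate the trap, consider a deliberately buggy, hypothetical caller (MNT_SOMETHING is a made-up flag, purely for illustration): | lock_mount_hash(); | list_add_tail(&m->mnt_instance, &s->s_mounts); | unlock_mount_hash(); | /* too late: sb_prepare_remount_readonly() may be setting or | * clearing MNT_WRITE_HOLD in ->mnt_flags right now */ | m->mnt.mnt_flags |= MNT_SOMETHING;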
Life gets simpler if we move that bit (and that's *all* that can get touched by access via this list) out of ->mnt_flags. It's not even hard to do - currently the list is implemented as list_head one, anchored in super_block->s_mounts and linked via mount->mnt_instance. As the first step, switch it to hlist-like open-coded structure - address of the first mount in the set is stored in ->s_mounts and ->mnt_instance replaced with ->mnt_next_for_sb and ->mnt_pprev_for_sb - the former either NULL or pointing to the next mount in set, the latter - address of either ->s_mounts or ->mnt_next_for_sb in the previous element of the set. In the next commit we'll steal the LSB of ->mnt_pprev_for_sb as replacement for MNT_WRITE_HOLD. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/mount.h | 4 +++- fs/namespace.c | 38 +++++++++++++++++++++++++++++--------- fs/super.c | 3 +-- include/linux/fs.h | 4 +++- 4 files changed, 36 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/mount.h b/fs/mount.h index 04d0eadc4c10..b208f69f69d7 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -64,7 +64,9 @@ struct mount { #endif struct list_head mnt_mounts; /* list of children, anchored here */ struct list_head mnt_child; /* and going through their mnt_child */ - struct list_head mnt_instance; /* mount instance on sb->s_mounts */ + struct mount *mnt_next_for_sb; /* the next two fields are hlist_node, */ + struct mount * __aligned(1) *mnt_pprev_for_sb; + /* except that LSB of pprev will be stolen */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ diff --git a/fs/namespace.c b/fs/namespace.c index 4e9ed4edd854..342dfd882b13 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -730,6 +730,27 @@ static inline void mnt_unhold_writers(struct mount *mnt) mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; } +static inline void mnt_del_instance(struct mount *m) +{ + struct mount **p = m->mnt_pprev_for_sb; + struct mount *next = m->mnt_next_for_sb; + + if (next) + next->mnt_pprev_for_sb = p; + *p = next; +} + +static inline void mnt_add_instance(struct mount *m, struct super_block *s) +{ + struct mount *first = s->s_mounts; + + if (first) + first->mnt_pprev_for_sb = &m->mnt_next_for_sb; + m->mnt_next_for_sb = first; + m->mnt_pprev_for_sb = &s->s_mounts; + s->s_mounts = m; +} + static int mnt_make_readonly(struct mount *mnt) { int ret; @@ -743,7 +764,6 @@ static int mnt_make_readonly(struct mount *mnt) int sb_prepare_remount_readonly(struct super_block *sb) { - struct mount *mnt; int err = 0; /* Racy optimization. 
Recheck the counter under MNT_WRITE_HOLD */ @@ -751,9 +771,9 @@ int sb_prepare_remount_readonly(struct super_block *sb) return -EBUSY; lock_mount_hash(); - list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { - if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { - err = mnt_hold_writers(mnt); + for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { + if (!(m->mnt.mnt_flags & MNT_READONLY)) { + err = mnt_hold_writers(m); if (err) break; } @@ -763,9 +783,9 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (!err) sb_start_ro_state_change(sb); - list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { - if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) - mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { + if (m->mnt.mnt_flags & MNT_WRITE_HOLD) + m->mnt.mnt_flags &= ~MNT_WRITE_HOLD; } unlock_mount_hash(); @@ -1207,7 +1227,7 @@ static void setup_mnt(struct mount *m, struct dentry *root) m->mnt_parent = m; lock_mount_hash(); - list_add_tail(&m->mnt_instance, &s->s_mounts); + mnt_add_instance(m, s); unlock_mount_hash(); } @@ -1425,7 +1445,7 @@ static void mntput_no_expire(struct mount *mnt) mnt->mnt.mnt_flags |= MNT_DOOMED; rcu_read_unlock(); - list_del(&mnt->mnt_instance); + mnt_del_instance(mnt); if (unlikely(!list_empty(&mnt->mnt_expire))) list_del(&mnt->mnt_expire); diff --git a/fs/super.c b/fs/super.c index 7f876f32343a..3b0f49e1b817 100644 --- a/fs/super.c +++ b/fs/super.c @@ -323,7 +323,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, if (!s) return NULL; - INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); @@ -408,7 +407,7 @@ static void __put_super(struct super_block *s) list_del_init(&s->s_list); WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); - WARN_ON(!list_empty(&s->s_mounts)); + WARN_ON(s->s_mounts); call_rcu(&s->rcu, destroy_super_rcu); } } diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..0e9c7f1460dc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1324,6 +1324,8 @@ struct sb_writers { struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; +struct mount; + struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ @@ -1358,7 +1360,7 @@ struct super_block { __u16 s_encoding_flags; #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ - struct list_head s_mounts; /* list of mounts; _not_ for fs use */ + struct mount *s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ struct file *s_bdev_file; struct backing_dev_info *s_bdi; -- cgit v1.2.3 From 3371fa2f27134fc4ec7d40b2ae7b9e92c3b2527e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 27 Aug 2025 13:37:12 -0400 Subject: struct mount: relocate MNT_WRITE_HOLD bit ... from ->mnt_flags to LSB of ->mnt_pprev_for_sb. This is safe - we always set and clear it within the same mount_lock scope, so we won't interfere with list operations - traversals are always forward, so they don't even look at ->mnt_prev_for_sb and both insertions and removals are in mount_lock scopes of their own, so that bit will be clear in *all* mount instances during those. 
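The bit manipulation itself is the usual pointer-tagging scheme: the addresses stored in ->mnt_pprev_for_sb (either &sb->s_mounts or the address of the previous mount's ->mnt_next_for_sb) are pointer-aligned, so their LSB is known to be zero and free to carry state. Roughly (a sketch only; the real helpers added below are test_write_hold(), set_write_hold() and clear_write_hold()): | held = (unsigned long)m->mnt_pprev_for_sb & WRITE_HOLD; | m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb | WRITE_HOLD);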
Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/mount.h | 25 ++++++++++++++++++++++++- fs/namespace.c | 34 +++++++++++++++++----------------- include/linux/mount.h | 3 +-- 3 files changed, 42 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/mount.h b/fs/mount.h index b208f69f69d7..40cf16544317 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -66,7 +66,8 @@ struct mount { struct list_head mnt_child; /* and going through their mnt_child */ struct mount *mnt_next_for_sb; /* the next two fields are hlist_node, */ struct mount * __aligned(1) *mnt_pprev_for_sb; - /* except that LSB of pprev will be stolen */ + /* except that LSB of pprev is stolen */ +#define WRITE_HOLD 1 /* ... for use by mnt_hold_writers() */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ @@ -244,4 +245,26 @@ static inline struct mount *topmost_overmount(struct mount *m) return m; } +static inline bool __test_write_hold(struct mount * __aligned(1) *val) +{ + return (unsigned long)val & WRITE_HOLD; +} + +static inline bool test_write_hold(const struct mount *m) +{ + return __test_write_hold(m->mnt_pprev_for_sb); +} + +static inline void set_write_hold(struct mount *m) +{ + m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb + | WRITE_HOLD); +} + +static inline void clear_write_hold(struct mount *m) +{ + m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb + & ~WRITE_HOLD); +} + struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry); diff --git a/fs/namespace.c b/fs/namespace.c index 342dfd882b13..714e159ed9cd 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -509,20 +509,20 @@ int mnt_get_write_access(struct vfsmount *m) mnt_inc_writers(mnt); /* * The store to mnt_inc_writers must be visible before we pass - * MNT_WRITE_HOLD loop below, so that the slowpath can see our - * incremented count after it has set MNT_WRITE_HOLD. + * WRITE_HOLD loop below, so that the slowpath can see our + * incremented count after it has set WRITE_HOLD. */ smp_mb(); might_lock(&mount_lock.lock); - while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { + while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { cpu_relax(); } else { /* * This prevents priority inversion, if the task - * setting MNT_WRITE_HOLD got preempted on a remote + * setting WRITE_HOLD got preempted on a remote * CPU, and it prevents life lock if the task setting - * MNT_WRITE_HOLD has a lower priority and is bound to + * WRITE_HOLD has a lower priority and is bound to * the same CPU as the task that is spinning here. */ preempt_enable(); @@ -533,7 +533,7 @@ int mnt_get_write_access(struct vfsmount *m) } /* * The barrier pairs with the barrier sb_start_ro_state_change() making - * sure that if we see MNT_WRITE_HOLD cleared, we will also see + * sure that if we see WRITE_HOLD cleared, we will also see * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in * mnt_is_readonly() and bail in case we are racing with remount * read-only. @@ -672,15 +672,15 @@ EXPORT_SYMBOL(mnt_drop_write_file); * @mnt. * * Context: This function expects lock_mount_hash() to be held serializing - * setting MNT_WRITE_HOLD. + * setting WRITE_HOLD. * Return: On success 0 is returned. * On error, -EBUSY is returned. 
*/ static inline int mnt_hold_writers(struct mount *mnt) { - mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; + set_write_hold(mnt); /* - * After storing MNT_WRITE_HOLD, we'll read the counters. This store + * After storing WRITE_HOLD, we'll read the counters. This store * should be visible before we do. */ smp_mb(); @@ -696,9 +696,9 @@ static inline int mnt_hold_writers(struct mount *mnt) * sum up each counter, if we read a counter before it is incremented, * but then read another CPU's count which it has been subsequently * decremented from -- we would see more decrements than we should. - * MNT_WRITE_HOLD protects against this scenario, because + * WRITE_HOLD protects against this scenario, because * mnt_want_write first increments count, then smp_mb, then spins on - * MNT_WRITE_HOLD, so it can't be decremented by another CPU while + * WRITE_HOLD, so it can't be decremented by another CPU while * we're counting up here. */ if (mnt_get_writers(mnt) > 0) @@ -720,14 +720,14 @@ static inline int mnt_hold_writers(struct mount *mnt) */ static inline void mnt_unhold_writers(struct mount *mnt) { - if (!(mnt->mnt_flags & MNT_WRITE_HOLD)) + if (!test_write_hold(mnt)) return; /* - * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers + * MNT_READONLY must become visible before ~WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); - mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + clear_write_hold(mnt); } static inline void mnt_del_instance(struct mount *m) @@ -766,7 +766,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) { int err = 0; - /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ + /* Racy optimization. Recheck the counter under WRITE_HOLD */ if (atomic_long_read(&sb->s_remove_count)) return -EBUSY; @@ -784,8 +784,8 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (!err) sb_start_ro_state_change(sb); for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { - if (m->mnt.mnt_flags & MNT_WRITE_HOLD) - m->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + if (test_write_hold(m)) + clear_write_hold(m); } unlock_mount_hash(); diff --git a/include/linux/mount.h b/include/linux/mount.h index 18e4b97f8a98..85e97b9340ff 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -33,7 +33,6 @@ enum mount_flags { MNT_NOSYMFOLLOW = 0x80, MNT_SHRINKABLE = 0x100, - MNT_WRITE_HOLD = 0x200, MNT_INTERNAL = 0x4000, @@ -52,7 +51,7 @@ enum mount_flags { | MNT_READONLY | MNT_NOSYMFOLLOW, MNT_ATIME_MASK = MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME, - MNT_INTERNAL_FLAGS = MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | + MNT_INTERNAL_FLAGS = MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_LOCKED }; -- cgit v1.2.3 From a79765248649de77771c24f7be08ff4c96f16f7a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 30 Aug 2025 02:48:13 -0400 Subject: constify {__,}mnt_is_readonly() Signed-off-by: Al Viro --- fs/namespace.c | 4 ++-- include/linux/mount.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 54066c9b8da0..4b74fabced43 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -428,7 +428,7 @@ out_free_cache: * mnt_want/drop_write() will _keep_ the filesystem * r/w. 
*/ -bool __mnt_is_readonly(struct vfsmount *mnt) +bool __mnt_is_readonly(const struct vfsmount *mnt) { return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb); } @@ -468,7 +468,7 @@ static unsigned int mnt_get_writers(struct mount *mnt) #endif } -static int mnt_is_readonly(struct vfsmount *mnt) +static int mnt_is_readonly(const struct vfsmount *mnt) { if (READ_ONCE(mnt->mnt_sb->s_readonly_remount)) return 1; diff --git a/include/linux/mount.h b/include/linux/mount.h index 85e97b9340ff..acfe7ef86a1b 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -76,7 +76,7 @@ extern void mntput(struct vfsmount *mnt); extern struct vfsmount *mntget(struct vfsmount *mnt); extern void mnt_make_shortterm(struct vfsmount *mnt); extern struct vfsmount *mnt_clone_internal(const struct path *path); -extern bool __mnt_is_readonly(struct vfsmount *mnt); +extern bool __mnt_is_readonly(const struct vfsmount *mnt); extern bool mnt_may_suid(struct vfsmount *mnt); extern struct vfsmount *clone_private_mount(const struct path *path); -- cgit v1.2.3 From b67a8631a4a8f26a18fac236aaf61aa2412c7a0d Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 13 Sep 2025 23:08:17 +0200 Subject: net: phy: remove mdio_board_info support from phylib After having removed mdio_board_info usage from dsa_loop, there's no user left. So let's drop support for it from phylib. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/01542a2e-05f5-4f13-acef-72632b33b5be@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/Makefile | 2 +- drivers/net/phy/mdio-boardinfo.c | 79 ------------------------------------- drivers/net/phy/mdio-boardinfo.h | 18 --------- drivers/net/phy/mdio_bus_provider.c | 33 ---------------- include/linux/phy.h | 10 ----- 5 files changed, 1 insertion(+), 141 deletions(-) delete mode 100644 drivers/net/phy/mdio-boardinfo.c delete mode 100644 drivers/net/phy/mdio-boardinfo.h (limited to 'include/linux') diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 402a33d559de..76e0db40f879 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -8,7 +8,7 @@ mdio-bus-y += mdio_bus.o mdio_device.o ifdef CONFIG_PHYLIB # built-in whenever PHYLIB is built-in or module -obj-y += stubs.o mdio-boardinfo.o +obj-y += stubs.o endif libphy-$(CONFIG_SWPHY) += swphy.o diff --git a/drivers/net/phy/mdio-boardinfo.c b/drivers/net/phy/mdio-boardinfo.c deleted file mode 100644 index d3184e8f12ec..000000000000 --- a/drivers/net/phy/mdio-boardinfo.c +++ /dev/null @@ -1,79 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * mdio-boardinfo - Collect pre-declarations for MDIO devices - */ - -#include -#include -#include -#include -#include -#include - -#include "mdio-boardinfo.h" - -static LIST_HEAD(mdio_board_list); -static DEFINE_MUTEX(mdio_board_lock); - -struct mdio_board_entry { - struct list_head list; - struct mdio_board_info board_info; -}; - -/** - * mdiobus_setup_mdiodev_from_board_info - create and setup MDIO devices - * from pre-collected board specific MDIO information - * @bus: Bus the board_info belongs to - * @cb: Callback to create device on bus - * Context: can sleep - */ -void mdiobus_setup_mdiodev_from_board_info(struct mii_bus *bus, - int (*cb) - (struct mii_bus *bus, - struct mdio_board_info *bi)) -{ - struct mdio_board_entry *be, *tmp; - - mutex_lock(&mdio_board_lock); - list_for_each_entry_safe(be, tmp, &mdio_board_list, list) { - struct mdio_board_info *bi = &be->board_info; - - if (strcmp(bus->id, bi->bus_id)) - continue; - - 
mutex_unlock(&mdio_board_lock); - cb(bus, bi); - mutex_lock(&mdio_board_lock); - } - mutex_unlock(&mdio_board_lock); -} -EXPORT_SYMBOL(mdiobus_setup_mdiodev_from_board_info); - -/** - * mdiobus_register_board_info - register MDIO devices for a given board - * @info: array of devices descriptors - * @n: number of descriptors provided - * Context: can sleep - * - * The board info passed can be marked with __initdata but be pointers - * such as platform_data etc. are copied as-is - */ -int mdiobus_register_board_info(const struct mdio_board_info *info, - unsigned int n) -{ - struct mdio_board_entry *be; - - be = kcalloc(n, sizeof(*be), GFP_KERNEL); - if (!be) - return -ENOMEM; - - for (int i = 0; i < n; i++, be++) { - be->board_info = info[i]; - mutex_lock(&mdio_board_lock); - list_add_tail(&be->list, &mdio_board_list); - mutex_unlock(&mdio_board_lock); - } - - return 0; -} -EXPORT_SYMBOL(mdiobus_register_board_info); diff --git a/drivers/net/phy/mdio-boardinfo.h b/drivers/net/phy/mdio-boardinfo.h deleted file mode 100644 index 0878b77878d4..000000000000 --- a/drivers/net/phy/mdio-boardinfo.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * mdio-boardinfo.h - board info interface internal to the mdio_bus - * component - */ - -#ifndef __MDIO_BOARD_INFO_H -#define __MDIO_BOARD_INFO_H - -struct mii_bus; -struct mdio_board_info; - -void mdiobus_setup_mdiodev_from_board_info(struct mii_bus *bus, - int (*cb) - (struct mii_bus *bus, - struct mdio_board_info *bi)); - -#endif /* __MDIO_BOARD_INFO_H */ diff --git a/drivers/net/phy/mdio_bus_provider.c b/drivers/net/phy/mdio_bus_provider.c index f43973e73ea3..a2391d4b7e5c 100644 --- a/drivers/net/phy/mdio_bus_provider.c +++ b/drivers/net/phy/mdio_bus_provider.c @@ -29,8 +29,6 @@ #include #include -#include "mdio-boardinfo.h" - /** * mdiobus_alloc_size - allocate a mii_bus structure * @size: extra amount of memory to allocate for private storage. @@ -132,35 +130,6 @@ static void of_mdiobus_link_mdiodev(struct mii_bus *bus, } #endif -/** - * mdiobus_create_device - create a full MDIO device given - * a mdio_board_info structure - * @bus: MDIO bus to create the devices on - * @bi: mdio_board_info structure describing the devices - * - * Returns 0 on success or < 0 on error. 
- */ -static int mdiobus_create_device(struct mii_bus *bus, - struct mdio_board_info *bi) -{ - struct mdio_device *mdiodev; - int ret = 0; - - mdiodev = mdio_device_create(bus, bi->mdio_addr); - if (IS_ERR(mdiodev)) - return -ENODEV; - - strscpy(mdiodev->modalias, bi->modalias, - sizeof(mdiodev->modalias)); - mdiodev->dev.platform_data = (void *)bi->platform_data; - - ret = mdio_device_register(mdiodev); - if (ret) - mdio_device_free(mdiodev); - - return ret; -} - static struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr, bool c45) { struct phy_device *phydev = ERR_PTR(-ENODEV); @@ -404,8 +373,6 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner) goto error; } - mdiobus_setup_mdiodev_from_board_info(bus, mdiobus_create_device); - bus->state = MDIOBUS_REGISTERED; dev_dbg(&bus->dev, "probed\n"); return 0; diff --git a/include/linux/phy.h b/include/linux/phy.h index 6f3b25cb7f4e..7da9e19471c9 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2129,16 +2129,6 @@ int __phy_hwtstamp_set(struct phy_device *phydev, extern const struct bus_type mdio_bus_type; extern const struct class mdio_bus_class; -struct mdio_board_info { - const char *bus_id; - char modalias[MDIO_NAME_SIZE]; - int mdio_addr; - const void *platform_data; -}; - -int mdiobus_register_board_info(const struct mdio_board_info *info, - unsigned int n); - /** * phy_module_driver() - Helper macro for registering PHY drivers * @__phy_drivers: array of PHY drivers to register -- cgit v1.2.3 From d69ae81efbc95c94a2760fc82d27cdab4c26fe76 Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Wed, 17 Sep 2025 18:15:14 +0800 Subject: power: supply: core: Add resistance power supply property Some battery drivers provide the ability to export internal resistance as a parameter. Add internal_resistance power supply property for that purpose. Signed-off-by: Fenglin Wu Signed-off-by: Sebastian Reichel --- Documentation/ABI/testing/sysfs-class-power | 16 ++++++++++++++++ drivers/power/supply/power_supply_sysfs.c | 1 + include/linux/power_supply.h | 1 + 3 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power index 87a058e14e7e..e6cce423c632 100644 --- a/Documentation/ABI/testing/sysfs-class-power +++ b/Documentation/ABI/testing/sysfs-class-power @@ -553,6 +553,22 @@ Description: Integer > 0: representing full cycles Integer = 0: cycle_count info is not available +What: /sys/class/power_supply//internal_resistance +Date: August 2025 +Contact: linux-arm-msm@vger.kernel.org +Description: + Represent the battery's internal resistance, often referred + to as Equivalent Series Resistance (ESR). It is a dynamic + parameter that reflects the opposition to current flow within + the cell. It is not a fixed value but varies significantly + based on several operational conditions, including battery + state of charge (SoC), temperature, and whether the battery + is in a charging or discharging state. 
+ + Access: Read + + Valid values: Represented in microohms + **USB Properties** What: /sys/class/power_supply//input_current_limit diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c index 18e5e84a81c6..8ba08d456352 100644 --- a/drivers/power/supply/power_supply_sysfs.c +++ b/drivers/power/supply/power_supply_sysfs.c @@ -223,6 +223,7 @@ static struct power_supply_attr power_supply_attrs[] __ro_after_init = { POWER_SUPPLY_ATTR(MANUFACTURE_YEAR), POWER_SUPPLY_ATTR(MANUFACTURE_MONTH), POWER_SUPPLY_ATTR(MANUFACTURE_DAY), + POWER_SUPPLY_ATTR(INTERNAL_RESISTANCE), /* Properties of type `const char *' */ POWER_SUPPLY_ATTR(MODEL_NAME), POWER_SUPPLY_ATTR(MANUFACTURER), diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index f21f806bfb38..f38da7c039d2 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -176,6 +176,7 @@ enum power_supply_property { POWER_SUPPLY_PROP_MANUFACTURE_YEAR, POWER_SUPPLY_PROP_MANUFACTURE_MONTH, POWER_SUPPLY_PROP_MANUFACTURE_DAY, + POWER_SUPPLY_PROP_INTERNAL_RESISTANCE, /* Properties of type `const char *' */ POWER_SUPPLY_PROP_MODEL_NAME, POWER_SUPPLY_PROP_MANUFACTURER, -- cgit v1.2.3 From cd93fbdce5981c947f22015ded3ac6bd1939b0ad Mon Sep 17 00:00:00 2001 From: Fenglin Wu Date: Wed, 17 Sep 2025 18:15:15 +0800 Subject: power: supply: core: Add state_of_health power supply property Add state_of_health power supply property to represent battery health percentage. Signed-off-by: Fenglin Wu Signed-off-by: Sebastian Reichel --- Documentation/ABI/testing/sysfs-class-power | 21 +++++++++++++++++++++ drivers/power/supply/power_supply_sysfs.c | 1 + include/linux/power_supply.h | 1 + 3 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power index e6cce423c632..4b21d5d23251 100644 --- a/Documentation/ABI/testing/sysfs-class-power +++ b/Documentation/ABI/testing/sysfs-class-power @@ -569,6 +569,27 @@ Description: Valid values: Represented in microohms +What: /sys/class/power_supply//state_of_health +Date: August 2025 +Contact: linux-arm-msm@vger.kernel.org +Description: + The state_of_health parameter quantifies the overall condition + of a battery as a percentage, reflecting its ability to deliver + rated performance relative to its original specifications. It is + dynamically computed using a combination of learned capacity + and impedance-based degradation indicators, both of which evolve + over the battery's lifecycle. + Note that the exact algorithms are kept secret by most battery + vendors and the value from different battery vendors cannot be + compared with each other as there is no vendor-agnostic definition + of "performance". Also this usually cannot be used for any + calculations (i.e. this is not the factor between charge_full and + charge_full_design). 
+ + Access: Read + + Valid values: 0 - 100 (percent) + **USB Properties** What: /sys/class/power_supply//input_current_limit diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c index 8ba08d456352..198405f7126f 100644 --- a/drivers/power/supply/power_supply_sysfs.c +++ b/drivers/power/supply/power_supply_sysfs.c @@ -224,6 +224,7 @@ static struct power_supply_attr power_supply_attrs[] __ro_after_init = { POWER_SUPPLY_ATTR(MANUFACTURE_MONTH), POWER_SUPPLY_ATTR(MANUFACTURE_DAY), POWER_SUPPLY_ATTR(INTERNAL_RESISTANCE), + POWER_SUPPLY_ATTR(STATE_OF_HEALTH), /* Properties of type `const char *' */ POWER_SUPPLY_ATTR(MODEL_NAME), POWER_SUPPLY_ATTR(MANUFACTURER), diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index f38da7c039d2..360ffdf272da 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -177,6 +177,7 @@ enum power_supply_property { POWER_SUPPLY_PROP_MANUFACTURE_MONTH, POWER_SUPPLY_PROP_MANUFACTURE_DAY, POWER_SUPPLY_PROP_INTERNAL_RESISTANCE, + POWER_SUPPLY_PROP_STATE_OF_HEALTH, /* Properties of type `const char *' */ POWER_SUPPLY_PROP_MODEL_NAME, POWER_SUPPLY_PROP_MANUFACTURER, -- cgit v1.2.3 From 05dfe654b5932322e297aba11dd6f3f26eea6ecb Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 16 Sep 2025 17:11:36 +0300 Subject: net/mlx5: Remove unused 'offset' field from mlx5_sq_bfreg The 'offset' field was introduced in the original commit [1] and never used until commit [2], which added an unnecessary use. Remove the field and refactor the write-combining test to use a local variable instead. [1] commit a6d51b68611e ("net/mlx5: Introduce blue flame register allocator") [2] commit d98995b4bf98 ("net/mlx5: Reimplement write combining test") Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/wc.c | 12 +++++++----- include/linux/mlx5/driver.h | 1 - 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wc.c b/drivers/net/ethernet/mellanox/mlx5/core/wc.c index 2f0316616fa4..276594586404 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wc.c @@ -255,7 +255,8 @@ static void mlx5_wc_destroy_sq(struct mlx5_wc_sq *sq) mlx5_wq_destroy(&sq->wq_ctrl); } -static void mlx5_wc_post_nop(struct mlx5_wc_sq *sq, bool signaled) +static void mlx5_wc_post_nop(struct mlx5_wc_sq *sq, unsigned int *offset, + bool signaled) { int buf_size = (1 << MLX5_CAP_GEN(sq->cq.mdev, log_bf_reg_size)) / 2; struct mlx5_wqe_ctrl_seg *ctrl; @@ -288,10 +289,10 @@ static void mlx5_wc_post_nop(struct mlx5_wc_sq *sq, bool signaled) */ wmb(); - __iowrite64_copy(sq->bfreg.map + sq->bfreg.offset, mmio_wqe, + __iowrite64_copy(sq->bfreg.map + *offset, mmio_wqe, sizeof(mmio_wqe) / 8); - sq->bfreg.offset ^= buf_size; + *offset ^= buf_size; } static int mlx5_wc_poll_cq(struct mlx5_wc_sq *sq) @@ -332,6 +333,7 @@ static int mlx5_wc_poll_cq(struct mlx5_wc_sq *sq) static void mlx5_core_test_wc(struct mlx5_core_dev *mdev) { + unsigned int offset = 0; unsigned long expires; struct mlx5_wc_sq *sq; int i, err; @@ -358,9 +360,9 @@ static void mlx5_core_test_wc(struct mlx5_core_dev *mdev) goto err_create_sq; for (i = 0; i < TEST_WC_NUM_WQES - 1; i++) - mlx5_wc_post_nop(sq, false); + mlx5_wc_post_nop(sq, &offset, false); - mlx5_wc_post_nop(sq, true); + mlx5_wc_post_nop(sq, &offset, true); expires = 
jiffies + TEST_WC_POLLING_MAX_TIME_JIFFIES; do { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fcfc18bfeba9..5a85b6d91ba3 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -434,7 +434,6 @@ struct mlx5_sq_bfreg { struct mlx5_uars_page *up; bool wc; u32 index; - unsigned int offset; }; struct mlx5_core_health { -- cgit v1.2.3 From aa4595d0ada65d5d44fa924a42a87c175d9d88e3 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 16 Sep 2025 17:11:38 +0300 Subject: net/mlx5: Store the global doorbell in mlx5_priv The global doorbell is used for more than just Ethernet resources, so move it out of mlx5e_hw_objs into a common place (mlx5_priv), to avoid non-Ethernet modules (e.g. HWS, ASO) depending on Ethernet structs. Use this opportunity to consolidate it with the 'uar' pointer already there, which was used as an RX doorbell. Underneath the 'uar' pointer is identical to 'bfreg->up', so store a single resource and use that instead. For CQ doorbells, care is taken to always use bfreg->up->index instead of bfreg->index, which may refer to a subsequent UAR page from the same ALLOC_UAR batch on some NICs. This paves the way for cleanly supporting multiple doorbells in the Ethernet driver. Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- drivers/infiniband/hw/mlx5/cq.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en/params.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_common.c | 11 +---------- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 +++++----- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c | 8 ++++---- drivers/net/ethernet/mellanox/mlx5/core/main.c | 11 +++++------ drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c | 8 ++++---- drivers/net/ethernet/mellanox/mlx5/core/wc.c | 4 ++-- include/linux/mlx5/driver.h | 3 +-- 12 files changed, 29 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 9c8003a78334..a23b364e24ff 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -648,7 +648,7 @@ int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev; struct mlx5_ib_cq *cq = to_mcq(ibcq); - void __iomem *uar_page = mdev->priv.uar->map; + void __iomem *uar_page = mdev->priv.bfreg.up->map; unsigned long irq_flags; int ret = 0; @@ -923,7 +923,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, cq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); - *index = dev->mdev->priv.uar->index; + *index = dev->mdev->priv.bfreg.up->index; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index 1fd403713baf..35039a95dcfd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -145,7 +145,7 @@ int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, mlx5_core_dbg(dev, "failed adding CP 0x%x to debug file system\n", cq->cqn); - cq->uar = dev->priv.uar; + cq->uar = dev->priv.bfreg.up; cq->irqn = eq->core.irqn; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c index 31e7f59bc19b..b6b4ae7c59fa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c @@ -810,7 +810,7 @@ static void mlx5e_build_common_cq_param(struct mlx5_core_dev *mdev, { void *cqc = param->cqc; - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); if (MLX5_CAP_GEN(mdev, cqe_128_always) && cache_line_size() >= 128) MLX5_SET(cqc, cqc, cqe_sz, CQE_STRIDE_128_PAD); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index 391b4e9c9dc4..7c1d9a9ea464 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -334,7 +334,7 @@ static int mlx5e_ptp_alloc_txqsq(struct mlx5e_ptp *c, int txq_ix, sq->mdev = mdev; sq->ch_ix = MLX5E_PTP_CHANNEL_IX; sq->txq_ix = txq_ix; - sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = mdev->priv.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); sq->stats = &c->priv->ptp_stats.sq[tc]; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c index 6ed3a32b7e22..e9e36358c39d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -163,17 +163,11 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises) goto err_dealloc_transport_domain; } - err = mlx5_alloc_bfreg(mdev, &res->bfreg, false, false); - if (err) { - mlx5_core_err(mdev, "alloc bfreg failed, %d\n", err); - goto err_destroy_mkey; - } - if (create_tises) { err = mlx5e_create_tises(mdev, res->tisn); if (err) { mlx5_core_err(mdev, "alloc tises failed, %d\n", err); - goto err_destroy_bfreg; + goto err_destroy_mkey; } res->tisn_valid = true; } @@ -190,8 +184,6 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises) return 0; -err_destroy_bfreg: - mlx5_free_bfreg(mdev, &res->bfreg); err_destroy_mkey: mlx5_core_destroy_mkey(mdev, res->mkey); err_dealloc_transport_domain: @@ -209,7 +201,6 @@ void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev) mdev->mlx5e_res.dek_priv = NULL; if (res->tisn_valid) mlx5e_destroy_tises(mdev, res->tisn); - mlx5_free_bfreg(mdev, &res->bfreg); mlx5_core_destroy_mkey(mdev, res->mkey); mlx5_core_dealloc_transport_domain(mdev, res->td.tdn); mlx5_core_dealloc_pd(mdev, res->pdn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index fe5f5ae433b7..22e3bc72a265 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1536,7 +1536,7 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c, sq->pdev = c->pdev; sq->mkey_be = c->mkey_be; sq->channel = c; - sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = mdev->priv.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu) - ETH_FCS_LEN; sq->xsk_pool = xsk_pool; @@ -1621,7 +1621,7 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c, int err; sq->channel = c; - sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = mdev->priv.bfreg.map; sq->reserved_room = param->stop_room; param->wq.db_numa_node = cpu_to_node(c->cpu); @@ -1706,7 +1706,7 @@ static int 
mlx5e_alloc_txqsq(struct mlx5e_channel *c, sq->priv = c->priv; sq->ch_ix = c->ix; sq->txq_ix = txq_ix; - sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = mdev->priv.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu); sq->max_sq_mpw_wqebbs = mlx5e_get_max_sq_aligned_wqebbs(mdev); @@ -1782,7 +1782,7 @@ static int mlx5e_create_sq(struct mlx5_core_dev *mdev, MLX5_SET(sqc, sqc, flush_in_error_en, 1); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); - MLX5_SET(wq, wq, uar_page, mdev->mlx5e_res.hw_objs.bfreg.index); + MLX5_SET(wq, wq, uar_page, mdev->priv.bfreg.index); MLX5_SET(wq, wq, log_wq_pg_sz, csp->wq_ctrl->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, csp->wq_ctrl->db.dma); @@ -2277,7 +2277,7 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param) MLX5_SET(cqc, cqc, cq_period_mode, mlx5e_cq_period_mode(param->cq_period_mode)); MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index f3c714ebd9cb..25499da177bc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -307,7 +307,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, eqc = MLX5_ADDR_OF(create_eq_in, in, eq_context_entry); MLX5_SET(eqc, eqc, log_eq_size, eq->fbc.log_sz); - MLX5_SET(eqc, eqc, uar_page, priv->uar->index); + MLX5_SET(eqc, eqc, uar_page, priv->bfreg.up->index); MLX5_SET(eqc, eqc, intr, vecidx); MLX5_SET(eqc, eqc, log_page_size, eq->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); @@ -320,7 +320,7 @@ create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, eq->eqn = MLX5_GET(create_eq_out, out, eq_number); eq->irqn = pci_irq_vector(dev->pdev, vecidx); eq->dev = dev; - eq->doorbell = priv->uar->map + MLX5_EQ_DOORBELL_OFFSET; + eq->doorbell = priv->bfreg.up->map + MLX5_EQ_DOORBELL_OFFSET; err = mlx5_debug_eq_add(dev, eq); if (err) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c index 58bd749b5e4d..129725159a93 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c @@ -100,7 +100,7 @@ static int create_aso_cq(struct mlx5_aso_cq *cq, void *cqc_data) MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); @@ -129,7 +129,7 @@ static int mlx5_aso_create_cq(struct mlx5_core_dev *mdev, int numa_node, return -ENOMEM; MLX5_SET(cqc, cqc_data, log_cq_size, 1); - MLX5_SET(cqc, cqc_data, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc_data, uar_page, mdev->priv.bfreg.up->index); if (MLX5_CAP_GEN(mdev, cqe_128_always) && cache_line_size() >= 128) MLX5_SET(cqc, cqc_data, cqe_sz, CQE_STRIDE_128_PAD); @@ -163,7 +163,7 @@ static int mlx5_aso_alloc_sq(struct mlx5_core_dev *mdev, int numa_node, struct mlx5_wq_param param; int err; - sq->uar_map 
= mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = mdev->priv.bfreg.map; param.db_numa_node = numa_node; param.buf_numa_node = numa_node; @@ -203,7 +203,7 @@ static int create_aso_sq(struct mlx5_core_dev *mdev, int pdn, MLX5_SET(sqc, sqc, ts_format, ts_format); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); - MLX5_SET(wq, wq, uar_page, mdev->mlx5e_res.hw_objs.bfreg.index); + MLX5_SET(wq, wq, uar_page, mdev->priv.bfreg.index); MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index eb3ac98a2621..6b6d6b05b893 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1316,10 +1316,9 @@ static int mlx5_load(struct mlx5_core_dev *dev) { int err; - dev->priv.uar = mlx5_get_uars_page(dev); - if (IS_ERR(dev->priv.uar)) { - mlx5_core_err(dev, "Failed allocating uar, aborting\n"); - err = PTR_ERR(dev->priv.uar); + err = mlx5_alloc_bfreg(dev, &dev->priv.bfreg, false, false); + if (err) { + mlx5_core_err(dev, "Failed allocating bfreg, %d\n", err); return err; } @@ -1430,7 +1429,7 @@ err_eq_table: err_irq_table: mlx5_pagealloc_stop(dev); mlx5_events_stop(dev); - mlx5_put_uars_page(dev, dev->priv.uar); + mlx5_free_bfreg(dev, &dev->priv.bfreg); return err; } @@ -1455,7 +1454,7 @@ static void mlx5_unload(struct mlx5_core_dev *dev) mlx5_irq_table_destroy(dev); mlx5_pagealloc_stop(dev); mlx5_events_stop(dev); - mlx5_put_uars_page(dev, dev->priv.uar); + mlx5_free_bfreg(dev, &dev->priv.bfreg); } int mlx5_init_one_devl_locked(struct mlx5_core_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c index b0595c9b09e4..24ef7d66fa8a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c @@ -690,7 +690,7 @@ static int hws_send_ring_alloc_sq(struct mlx5_core_dev *mdev, size_t buf_sz; int err; - sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; + sq->uar_map = mdev->priv.bfreg.map; sq->mdev = mdev; param.db_numa_node = numa_node; @@ -764,7 +764,7 @@ static int hws_send_ring_create_sq(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(sqc, sqc, ts_format, ts_format); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); - MLX5_SET(wq, wq, uar_page, mdev->mlx5e_res.hw_objs.bfreg.index); + MLX5_SET(wq, wq, uar_page, mdev->priv.bfreg.index); MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma); @@ -940,7 +940,7 @@ static int hws_send_ring_create_cq(struct mlx5_core_dev *mdev, (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas)); MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); @@ -963,7 +963,7 @@ static int hws_send_ring_open_cq(struct mlx5_core_dev *mdev, if (!cqc_data) return -ENOMEM; - MLX5_SET(cqc, cqc_data, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc_data, uar_page, mdev->priv.bfreg.up->index); MLX5_SET(cqc, cqc_data, log_cq_size, ilog2(queue->num_entries)); err = hws_send_ring_alloc_cq(mdev, numa_node, queue, cqc_data, cq); diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/wc.c b/drivers/net/ethernet/mellanox/mlx5/core/wc.c index 276594586404..999d6216648a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/wc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/wc.c @@ -94,7 +94,7 @@ static int create_wc_cq(struct mlx5_wc_cq *cq, void *cqc_data) MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); @@ -116,7 +116,7 @@ static int mlx5_wc_create_cq(struct mlx5_core_dev *mdev, struct mlx5_wc_cq *cq) return -ENOMEM; MLX5_SET(cqc, cqc, log_cq_size, TEST_WC_LOG_CQ_SZ); - MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.bfreg.up->index); if (MLX5_CAP_GEN(mdev, cqe_128_always) && cache_line_size() >= 128) MLX5_SET(cqc, cqc, cqe_sz, CQE_STRIDE_128_PAD); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5a85b6d91ba3..15c434fedff7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -612,7 +612,7 @@ struct mlx5_priv { struct mlx5_ft_pool *ft_pool; struct mlx5_bfreg_data bfregs; - struct mlx5_uars_page *uar; + struct mlx5_sq_bfreg bfreg; #ifdef CONFIG_MLX5_SF struct mlx5_vhca_state_notifier *vhca_state_notifier; struct mlx5_sf_dev_table *sf_dev_table; @@ -658,7 +658,6 @@ struct mlx5e_resources { u32 pdn; struct mlx5_td td; u32 mkey; - struct mlx5_sq_bfreg bfreg; #define MLX5_MAX_NUM_TC 8 u32 tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC]; bool tisn_valid; -- cgit v1.2.3 From a315b723e87ba4e4573e1e5c759d512f38bdc0b3 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 16 Sep 2025 17:11:40 +0300 Subject: net/mlx5e: Prepare for using different CQ doorbells Completion queues (CQs) in mlx5 use the same global doorbell, which may become contended when accessed concurrently from many cores. This patch prepares the CQ management code for supporting different doorbells per CQ. This will be used in downstream patches to allow separate doorbells to be used by channel CQs. The main change is moving the 'uar' pointer from struct mlx5_core_cq to struct mlx5e_cq, as the uar page to be used is better off stored directly there. Other users of mlx5_core_cq also store the UAR to be used separately, so the pointer being removed is dead weight for them. As evidence, this patch touches two users that set the mcq.uar pointer but did not use it: Software Steering and the old Innova CQ creation code. Instead, they rang the doorbell directly from another pointer. The 'uar' pointer added to struct mlx5e_cq remains in a hot cacheline (as before), because it may get accessed for each packet.
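To make the hot-path consequence concrete, here is a minimal sketch of the arming step once the UAR lives in mlx5e_cq (the example_ wrapper is hypothetical; the mlx5_cq_arm() call and the field names are taken from the diff below):

	/* Arm the CQ using the UAR page cached in struct mlx5e_cq at
	 * allocation time (from mdev->priv.bfreg.up), so the arm path
	 * no longer chases the pointer that used to sit in mlx5_core_cq.
	 */
	static inline void example_cq_arm(struct mlx5e_cq *cq)
	{
		mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, cq->uar->map, cq->wq.cc);
	}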
Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h | 5 +---- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 +++++++--- drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c | 1 - include/linux/mlx5/cq.h | 1 - 7 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index 35039a95dcfd..e9f319a9bdd6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -145,7 +145,6 @@ int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, mlx5_core_dbg(dev, "failed adding CP 0x%x to debug file system\n", cq->cqn); - cq->uar = dev->priv.bfreg.up; cq->irqn = eq->core.irqn; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 9c73165653bf..1cbe3f3037bb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -344,6 +344,7 @@ struct mlx5e_cq { /* data path - accessed per napi poll */ u16 event_ctr; struct napi_struct *napi; + struct mlx5_uars_page *uar; struct mlx5_core_cq mcq; struct mlx5e_ch_stats *ch_stats; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index 5dc04bbfc71b..6760bb0336df 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -309,10 +309,7 @@ mlx5e_notify_hw(struct mlx5_wq_cyc *wq, u16 pc, void __iomem *uar_map, static inline void mlx5e_cq_arm(struct mlx5e_cq *cq) { - struct mlx5_core_cq *mcq; - - mcq = &cq->mcq; - mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, cq->wq.cc); + mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, cq->uar->map, cq->wq.cc); } static inline struct mlx5e_sq_dma * diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c1491ed3db1c..93505c82d6f0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2189,6 +2189,7 @@ static void mlx5e_close_xdpredirect_sq(struct mlx5e_xdpsq *xdpsq) static int mlx5e_alloc_cq_common(struct mlx5_core_dev *mdev, struct net_device *netdev, struct workqueue_struct *workqueue, + struct mlx5_uars_page *uar, struct mlx5e_cq_param *param, struct mlx5e_cq *cq) { @@ -2220,6 +2221,7 @@ static int mlx5e_alloc_cq_common(struct mlx5_core_dev *mdev, cq->mdev = mdev; cq->netdev = netdev; cq->workqueue = workqueue; + cq->uar = uar; return 0; } @@ -2235,7 +2237,8 @@ static int mlx5e_alloc_cq(struct mlx5_core_dev *mdev, param->wq.db_numa_node = ccp->node; param->eq_ix = ccp->ix; - err = mlx5e_alloc_cq_common(mdev, ccp->netdev, ccp->wq, param, cq); + err = mlx5e_alloc_cq_common(mdev, ccp->netdev, ccp->wq, + mdev->priv.bfreg.up, param, cq); cq->napi = ccp->napi; cq->ch_stats = ccp->ch_stats; @@ -2280,7 +2283,7 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param) MLX5_SET(cqc, cqc, cq_period_mode, mlx5e_cq_period_mode(param->cq_period_mode)); MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); - MLX5_SET(cqc, cqc, uar_page, 
mdev->priv.bfreg.up->index); + MLX5_SET(cqc, cqc, uar_page, cq->uar->index); MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); @@ -3593,7 +3596,8 @@ static int mlx5e_alloc_drop_cq(struct mlx5e_priv *priv, param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(mdev)); param->wq.db_numa_node = dev_to_node(mlx5_core_dma_dev(mdev)); - return mlx5e_alloc_cq_common(priv->mdev, priv->netdev, priv->wq, param, cq); + return mlx5e_alloc_cq_common(priv->mdev, priv->netdev, priv->wq, + mdev->priv.bfreg.up, param, cq); } int mlx5e_open_drop_rq(struct mlx5e_priv *priv, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c index c4de6bf8d1b6..cb1319974f83 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -475,7 +475,6 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) *conn->cq.mcq.arm_db = 0; conn->cq.mcq.vector = 0; conn->cq.mcq.comp = mlx5_fpga_conn_cq_complete; - conn->cq.mcq.uar = fdev->conn_res.uar; tasklet_setup(&conn->cq.tasklet, mlx5_fpga_conn_cq_tasklet); mlx5_fpga_dbg(fdev, "Created CQ #0x%x\n", conn->cq.mcq.cqn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c index 4fd4e8483382..077a77fde670 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_send.c @@ -1131,7 +1131,6 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, *cq->mcq.arm_db = cpu_to_be32(2 << 28); cq->mcq.vector = 0; - cq->mcq.uar = uar; cq->mdev = mdev; return cq; diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 991526039ccb..7ef2c7c7d803 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -41,7 +41,6 @@ struct mlx5_core_cq { int cqe_sz; __be32 *set_ci_db; __be32 *arm_db; - struct mlx5_uars_page *uar; refcount_t refcount; struct completion free; unsigned vector; -- cgit v1.2.3 From 71fb4832d50b01f0af2d257360c239879ce93a8e Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Tue, 16 Sep 2025 17:11:41 +0300 Subject: net/mlx5e: Use multiple TX doorbells First, allocate more doorbells in mlx5e_create_mdev_resources: - one doorbell remains 'global' and will be used by all non-channel-associated SQs (e.g. ASO, HWS, PTP, ...). - allocate additional 'num_doorbells' doorbells. This defaults to the minimum of 8 and the maximum number of channels. mlx5e_channel_pick_doorbell() now spreads out channel SQs across the available doorbells.
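The spreading policy reduces to a simple modulo pick; below is a minimal sketch using the field names from the diff that follows (the standalone example_ helper is illustrative only, not the driver's actual function):

	/* Map a channel's vec_ix to one of the dedicated doorbells,
	 * falling back to the global doorbell when none could be
	 * allocated. With the default of 8 doorbells, channels 0..7 get
	 * doorbells 0..7 and channel 8 wraps back to doorbell 0.
	 */
	static struct mlx5_sq_bfreg *
	example_pick_doorbell(struct mlx5e_hw_objs *hw_objs,
			      struct mlx5_sq_bfreg *global, unsigned int vec_ix)
	{
		if (hw_objs->num_bfregs == 0)
			return global;
		return hw_objs->bfregs + vec_ix % hw_objs->num_bfregs;
	}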
Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_common.c | 29 +++++++++++++++++++++- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 11 +++++++- include/linux/mlx5/driver.h | 4 +++ 3 files changed, 42 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c index e9e36358c39d..d13cebbc763a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c @@ -143,6 +143,7 @@ err_close_tises: int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises) { struct mlx5e_hw_objs *res = &mdev->mlx5e_res.hw_objs; + unsigned int num_doorbells, i; int err; err = mlx5_core_alloc_pd(mdev, &res->pdn); @@ -163,11 +164,30 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises) goto err_dealloc_transport_domain; } + num_doorbells = min(MLX5_DEFAULT_NUM_DOORBELLS, + mlx5e_get_max_num_channels(mdev)); + res->bfregs = kcalloc(num_doorbells, sizeof(*res->bfregs), GFP_KERNEL); + if (!res->bfregs) { + err = -ENOMEM; + goto err_destroy_mkey; + } + + for (i = 0; i < num_doorbells; i++) { + err = mlx5_alloc_bfreg(mdev, res->bfregs + i, false, false); + if (err) { + mlx5_core_warn(mdev, + "could only allocate %d/%d doorbells, err %d.\n", + i, num_doorbells, err); + break; + } + } + res->num_bfregs = i; + if (create_tises) { err = mlx5e_create_tises(mdev, res->tisn); if (err) { mlx5_core_err(mdev, "alloc tises failed, %d\n", err); - goto err_destroy_mkey; + goto err_destroy_bfregs; } res->tisn_valid = true; } @@ -184,6 +204,10 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev, bool create_tises) return 0; +err_destroy_bfregs: + for (i = 0; i < res->num_bfregs; i++) + mlx5_free_bfreg(mdev, res->bfregs + i); + kfree(res->bfregs); err_destroy_mkey: mlx5_core_destroy_mkey(mdev, res->mkey); err_dealloc_transport_domain: @@ -201,6 +225,9 @@ void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev) mdev->mlx5e_res.dek_priv = NULL; if (res->tisn_valid) mlx5e_destroy_tises(mdev, res->tisn); + for (unsigned int i = 0; i < res->num_bfregs; i++) + mlx5_free_bfreg(mdev, res->bfregs + i); + kfree(res->bfregs); mlx5_core_destroy_mkey(mdev, res->mkey); mlx5_core_dealloc_transport_domain(mdev, res->td.tdn); mlx5_core_dealloc_pd(mdev, res->pdn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 93505c82d6f0..0700656b3c3e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2752,7 +2752,16 @@ void mlx5e_trigger_napi_sched(struct napi_struct *napi) static void mlx5e_channel_pick_doorbell(struct mlx5e_channel *c) { - c->bfreg = &c->mdev->priv.bfreg; + struct mlx5e_hw_objs *hw_objs = &c->mdev->mlx5e_res.hw_objs; + + /* No dedicated Ethernet doorbells, use the global one. */ + if (hw_objs->num_bfregs == 0) { + c->bfreg = &c->mdev->priv.bfreg; + return; + } + + /* Round-robin between doorbells. 
*/ + c->bfreg = hw_objs->bfregs + c->vec_ix % hw_objs->num_bfregs; } static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 15c434fedff7..99b34e4809ae 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -658,6 +658,8 @@ struct mlx5e_resources { u32 pdn; struct mlx5_td td; u32 mkey; + struct mlx5_sq_bfreg *bfregs; + unsigned int num_bfregs; #define MLX5_MAX_NUM_TC 8 u32 tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC]; bool tisn_valid; @@ -801,6 +803,8 @@ struct mlx5_db { int index; }; +#define MLX5_DEFAULT_NUM_DOORBELLS 8 + enum { MLX5_COMP_EQ_SIZE = 1024, }; -- cgit v1.2.3 From 542a495cbaa6dc57a310da62b501fdf318657cad Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:25 +0200 Subject: tcp: AccECN core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change implements Accurate ECN without negotiation and without the AccECN option (both will be added by later changes). Based on the AccECN specification: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt Accurate ECN allows feeding back the number of CE (congestion experienced) marks accurately to the sender, in contrast to RFC3168 ECN, which can only signal one marks-seen-yes/no per RTT. Congestion control algorithms can take advantage of the accurate ECN information to fine-tune their congestion response to avoid drastic rate reduction when only mild congestion is encountered. With Accurate ECN, tp->received_ce (r.cep in the AccECN spec) keeps track of how many segments have arrived with a CE mark. Accurate ECN uses the ACE field (ECE, CWR, AE) to communicate the value back to the sender, which updates tp->delivered_ce (s.cep) based on the feedback. This signalling channel is lossy when an ACE field overflow occurs. A conservative strategy is selected here to deal with ACE overflow; however, some strategies using the AccECN option later in the overall patchset mitigate falsely detected overflows. The ACE field values on the wire are offset by TCP_ACCECN_CEP_INIT_OFFSET. Delivered_ce/received_ce count the real CE marks rather than forcing all downstream users to adapt to the wire offset. This patch uses the first 1-byte hole and the last 4-byte hole of the tcp_sock_write_txrx group for 'received_ce_pending' and 'received_ce'. Also, the group size of tcp_sock_write_txrx is increased from 91 + 4 to 95 + 4 due to the new u32 received_ce member. Below are the trimmed pahole outcomes before and after this patch. [BEFORE THIS PATCH] struct tcp_sock { [...] __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* XXX 2 bytes hole, try to pack */ [...] u32 delivered_ce; /* 2576 4 */ u32 app_limited; /* 2580 4 */ u32 rcv_wnd; /* 2584 4 */ struct tcp_options_received rx_opt; /* 2588 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2612 0 */ /* XXX 4 bytes hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 161 */ }
u32 delivered_ce; /* 2576 4 */ u32 received_ce; /* 2580 4 */ u32 app_limited; /* 2584 4 */ u32 rcv_wnd; /* 2588 4 */ struct tcp_options_received rx_opt; /* 2592 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2616 0 */ [...] /* size: 3200, cachelines: 50, members: 164 */ } Signed-off-by: Ilpo Järvinen Co-developed-by: Olivier Tilmans Signed-off-by: Olivier Tilmans Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-2-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- .../networking/net_cachelines/tcp_sock.rst | 2 + include/linux/tcp.h | 3 + include/net/tcp.h | 15 ++++ include/net/tcp_ecn.h | 53 ++++++++++- net/ipv4/tcp.c | 5 +- net/ipv4/tcp_input.c | 100 +++++++++++++++++++-- net/ipv4/tcp_output.c | 9 +- 7 files changed, 175 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 7bbda5944ee2..31313a9adccc 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -101,6 +101,8 @@ u32 prr_delivered u32 prr_out read_mostly read_mostly tcp_rate_skb_sent,tcp_newly_delivered(tx);tcp_ack,tcp_rate_gen,tcp_clean_rtx_queue(rx) u32 delivered read_mostly read_write tcp_rate_skb_sent, tcp_newly_delivered(tx);tcp_ack, tcp_rate_gen, tcp_clean_rtx_queue (rx) u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) +u32 received_ce read_mostly read_write +u8:4 received_ce_pending read_mostly read_write u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u64 first_tx_mstamp read_write tcp_rate_skb_sent diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d103cc0e7a35..90cee6e53527 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -287,6 +287,8 @@ struct tcp_sock { */ u8 nonagle : 4,/* Disable Nagle algorithm? */ rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ + u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ + unused2:4; __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ @@ -299,6 +301,7 @@ struct tcp_sock { u32 snd_up; /* Urgent pointer */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ + u32 received_ce; /* Like the above but for rcvd CE marked pkts */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ /* diff --git a/include/net/tcp.h b/include/net/tcp.h index e25340459ce4..bc5159fe842e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -973,6 +973,14 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) #define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) +#define TCP_ACCECN_CEP_ACE_MASK 0x7 +#define TCP_ACCECN_ACE_MAX_DELTA 6 + +/* To avoid/detect middlebox interference, not all counters start at 0. + * See draft-ietf-tcpm-accurate-ecn for the latest values. 
+ */ +#define TCP_ACCECN_CEP_INIT_OFFSET 5 + /* State flags for sacked in struct tcp_skb_cb */ enum tcp_skb_cb_sacked_flags { TCPCB_SACKED_ACKED = (1 << 0), /* SKB ACK'd by a SACK block */ @@ -1782,11 +1790,18 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { + u32 ace; + /* mptcp hooks are only on the slow path */ if (sk_is_mptcp((struct sock *)tp)) return; + ace = tcp_ecn_mode_accecn(tp) ? + ((tp->delivered_ce + TCP_ACCECN_CEP_INIT_OFFSET) & + TCP_ACCECN_CEP_ACE_MASK) : 0; + tp->pred_flags = htonl((tp->tcp_header_len << 26) | + (ace << 22) | ntohl(TCP_FLAG_ACK) | snd_wnd); } diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index b3430557676b..b0ed89dbad41 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -12,6 +12,7 @@ static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) { + /* Do not set CWR if in AccECN mode! */ if (tcp_ecn_mode_rfc3168(tp)) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } @@ -19,8 +20,10 @@ static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) static inline void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) { - if (tcp_hdr(skb)->cwr) { - tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) { + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; /* If the sender is telling us it has entered CWR, then its * cwnd may be very low (even just 1 packet), so we should ACK @@ -36,6 +39,52 @@ static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; } +static inline u8 tcp_accecn_ace(const struct tcphdr *th) +{ + return (th->ae << 2) | (th->cwr << 1) | th->ece; +} + +static inline void tcp_accecn_init_counters(struct tcp_sock *tp) +{ + tp->received_ce = 0; + tp->received_ce_pending = 0; +} + +/* Updates Accurate ECN received counters from the received IP ECN field */ +static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb) +{ + u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + u8 is_ce = INET_ECN_is_ce(ecnfield); + struct tcp_sock *tp = tcp_sk(sk); + + if (!INET_ECN_is_not_ect(ecnfield)) { + u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs); + + /* As for accurate ECN, the TCP_ECN_SEEN flag is set by + * tcp_ecn_received_counters() when the ECN codepoint of + * received TCP data or ACK contains ECT(0), ECT(1), or CE. 
+ */ + if (!tcp_ecn_mode_rfc3168(tp)) + tp->ecn_flags |= TCP_ECN_SEEN; + + /* ACE counter tracks *all* segments including pure ACKs */ + tp->received_ce += pcount; + tp->received_ce_pending = min(tp->received_ce_pending + pcount, + 0xfU); + } +} + +static inline void tcp_accecn_set_ace(struct tcphdr *th, struct tcp_sock *tp) +{ + u32 wire_ace; + + wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; + th->ece = !!(wire_ace & 0x1); + th->cwr = !!(wire_ace & 0x2); + th->ae = !!(wire_ace & 0x4); + tp->received_ce_pending = 0; +} + static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1f643faa8b93..16456c10e5e8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -271,6 +271,7 @@ #include #include #include +#include #include #include #include @@ -3406,6 +3407,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; + tcp_accecn_init_counters(tp); if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); @@ -5138,6 +5140,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); @@ -5145,7 +5148,7 @@ static void __init tcp_struct_check(void) /* 32bit arches with 8byte alignment on u64 fields might need padding * before tcp_clock_cache. */ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 91 + 4); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 95 + 4); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b2793e749cfd..98782134c2f4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -360,16 +360,25 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); - if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) && + tcp_ecn_mode_rfc3168(tp)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode(sk, 2); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } + /* As for RFC3168 ECN, the TCP_ECN_SEEN flag is set by + * tcp_data_ecn_check() when the ECN codepoint of + * received TCP data contains ECT(0), ECT(1), or CE. 
+ */ + if (!tcp_ecn_mode_rfc3168(tp)) + break; tp->ecn_flags |= TCP_ECN_SEEN; break; default: if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + if (!tcp_ecn_mode_rfc3168(tp)) + break; tp->ecn_flags |= TCP_ECN_SEEN; break; } @@ -385,10 +394,64 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, bool ece_ack) { tp->delivered += delivered; - if (ece_ack) + if (tcp_ecn_mode_rfc3168(tp) && ece_ack) tcp_count_delivered_ce(tp, delivered); } +/* Returns the ECN CE delta */ +static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, + u32 delivered_pkts, int flag) +{ + const struct tcphdr *th = tcp_hdr(skb); + struct tcp_sock *tp = tcp_sk(sk); + u32 delta, safe_delta; + u32 corrected_ace; + + /* Reordered ACK or uncertain due to lack of data to send and ts */ + if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) + return 0; + + if (!(flag & FLAG_SLOWPATH)) { + /* AccECN counter might overflow on large ACKs */ + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) + return 0; + } + + /* ACE field is not available during handshake */ + if (flag & FLAG_SYN_ACKED) + return 0; + + if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + + corrected_ace = tcp_accecn_ace(th) - TCP_ACCECN_CEP_INIT_OFFSET; + delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK; + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) + return delta; + + safe_delta = delivered_pkts - + ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK); + + return safe_delta; +} + +static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, + u32 delivered_pkts, int *flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 delta; + + delta = __tcp_accecn_process(sk, skb, delivered_pkts, *flag); + if (delta > 0) { + tcp_count_delivered_ce(tp, delta); + *flag |= FLAG_ECE; + /* Recalculate header predictor */ + if (tp->pred_flags) + tcp_fast_path_on(tp); + } + return delta; +} + /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. @@ -3744,7 +3807,8 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) } /* Returns the number of packets newly acked or sacked by the current ACK */ -static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) +static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, + u32 ecn_count, int flag) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -3752,8 +3816,12 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) delivered = tp->delivered - prior_delivered; NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); - if (flag & FLAG_ECE) - NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); + + if (flag & FLAG_ECE) { + if (tcp_ecn_mode_rfc3168(tp)) + ecn_count = delivered; + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count); + } return delivered; } @@ -3774,6 +3842,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 delivered = tp->delivered; u32 lost = tp->lost; int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 ecn_count = 0; /* Did we receive ECE/an AccECN ACE update? 
*/ u32 prior_fack; sack_state.first_sackt = 0; @@ -3881,6 +3950,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); + if (tcp_ecn_mode_accecn(tp)) + ecn_count = tcp_accecn_process(sk, skb, + tp->delivered - delivered, + &flag); + tcp_in_ack_event(sk, flag); if (tp->tlp_high_seq) @@ -3905,7 +3979,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); - delivered = tcp_newly_delivered(sk, delivered, flag); + delivered = tcp_newly_delivered(sk, delivered, ecn_count, flag); + lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); @@ -3914,12 +3989,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) return 1; no_queue: + if (tcp_ecn_mode_accecn(tp)) + ecn_count = tcp_accecn_process(sk, skb, + tp->delivered - delivered, + &flag); tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); - tcp_newly_delivered(sk, delivered, flag); + tcp_newly_delivered(sk, delivered, ecn_count, flag); } /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than @@ -3940,7 +4019,7 @@ old_ack: &sack_state); tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); - tcp_newly_delivered(sk, delivered, flag); + tcp_newly_delivered(sk, delivered, ecn_count, flag); tcp_xmit_recovery(sk, rexmit); } @@ -6071,6 +6150,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) flag |= __tcp_replace_ts_recent(tp, delta); + tcp_ecn_received_counters(sk, skb); + /* We know that such packets are checksummed * on entry. */ @@ -6119,6 +6200,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) /* Bulk data transfer: receiver */ tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); + tcp_ecn_received_counters(sk, skb); eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); @@ -6159,6 +6241,8 @@ validate: return; step5: + tcp_ecn_received_counters(sk, skb); + reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); if ((int)reason < 0) { reason = -reason; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index be8ceefa5332..a3a6d3e91d84 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -328,7 +328,14 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_ecn_mode_rfc3168(tp)) { + if (!tcp_ecn_mode_any(tp)) + return; + + if (tcp_ecn_mode_accecn(tp)) { + INET_ECN_xmit(sk); + tcp_accecn_set_ace(th, tp); + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN; + } else { /* Not-retransmitted data segment: set ECT and inject CWR. */ if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { -- cgit v1.2.3 From 3cae34274c79e0c60ccd1c10516973af1aed2a7c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:26 +0200 Subject: tcp: accecn: AccECN negotiation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Accurate ECN negotiation parts based on the specification: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt Accurate ECN is negotiated using ECE, CWR and AE flags in the TCP header. 
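For orientation, the three flags are read as a single 3-bit value throughout the negotiation; here is a minimal sketch mirroring the tcp_accecn_ace() helper introduced in the previous patch (the example_ name is illustrative):

	/* Fold the AE, CWR and ECE header bits into the 3-bit ACE value
	 * (0..7): ACE = (AE << 2) | (CWR << 1) | ECE. The value 0b111 on
	 * a SYN requests AccECN; see the negotiation tables below.
	 */
	static inline u8 example_tcp_ace(const struct tcphdr *th)
	{
		return (th->ae << 2) | (th->cwr << 1) | th->ece;
	}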
TCP falls back to RFC3168 ECN if one of the ends supports only RFC3168-style ECN. The AccECN negotiation includes reflecting the IP ECN field value seen in the SYN and SYNACK back using the same bits as the negotiation, to allow responding to SYN CE marks and to detect ECN field mangling. CE marks should not occur currently because SYN=1 segments are sent with Non-ECT in the IP ECN field (but a proposal exists to remove this restriction). Reflecting the SYN IP ECN field in the SYNACK is relatively simple. Reflecting the SYNACK IP ECN field in the final/third ACK of the handshake is more challenging. Linux TCP code is not well prepared for using the final/third ACK as a signalling channel, which makes things somewhat complicated here. The tcp_ecn sysctl can be used to select the highest ECN variant (Accurate ECN, ECN, No ECN) that is attempted to be negotiated and requested for incoming and outgoing connections: TCP_ECN_IN_NOECN_OUT_NOECN, TCP_ECN_IN_ECN_OUT_ECN, TCP_ECN_IN_ECN_OUT_NOECN, TCP_ECN_IN_ACCECN_OUT_ACCECN, TCP_ECN_IN_ACCECN_OUT_ECN, and TCP_ECN_IN_ACCECN_OUT_NOECN. After this patch, the size of tcp_request_sock remains unchanged and no new holes are added. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_request_sock { [...] u32 rcv_nxt; /* 352 4 */ u8 syn_tos; /* 356 1 */ /* size: 360, cachelines: 6, members: 16 */ } [AFTER THIS PATCH] struct tcp_request_sock { [...] u32 rcv_nxt; /* 352 4 */ u8 syn_tos; /* 356 1 */ bool accecn_ok; /* 357 1 */ u8 syn_ect_snt:2; /* 358: 0 1 */ u8 syn_ect_rcv:2; /* 358: 2 1 */ u8 accecn_fail_mode:4; /* 358: 4 1 */ /* size: 360, cachelines: 6, members: 20 */ } After this patch, the size of tcp_sock remains unchanged and no new holes are added. Also, 4 bits of the existing 2-byte hole are exploited. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_sock { [...] u8 dup_ack_counter:2; /* 2761: 0 1 */ u8 tlp_retrans:1; /* 2761: 2 1 */ u8 unused:5; /* 2761: 3 1 */ u8 thin_lto:1; /* 2762: 0 1 */ u8 fastopen_connect:1; /* 2762: 1 1 */ u8 fastopen_no_cookie:1; /* 2762: 2 1 */ u8 fastopen_client_fail:2; /* 2762: 3 1 */ u8 frto:1; /* 2762: 5 1 */ /* XXX 2 bits hole, try to pack */ [...] u8 keepalive_probes; /* 2765 1 */ /* XXX 2 bytes hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 164 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u8 dup_ack_counter:2; /* 2761: 0 1 */ u8 tlp_retrans:1; /* 2761: 2 1 */ u8 syn_ect_snt:2; /* 2761: 3 1 */ u8 syn_ect_rcv:2; /* 2761: 5 1 */ u8 thin_lto:1; /* 2761: 7 1 */ u8 fastopen_connect:1; /* 2762: 0 1 */ u8 fastopen_no_cookie:1; /* 2762: 1 1 */ u8 fastopen_client_fail:2; /* 2762: 2 1 */ u8 frto:1; /* 2762: 4 1 */ /* XXX 3 bits hole, try to pack */ [...] u8 keepalive_probes; /* 2765 1 */ u8 accecn_fail_mode:4; /* 2766: 0 1 */ /* XXX 4 bits hole, try to pack */ /* XXX 1 byte hole, try to pack */ [...]
/* size: 3200, cachelines: 50, members: 166 */ } Signed-off-by: Ilpo Järvinen Co-developed-by: Olivier Tilmans Signed-off-by: Olivier Tilmans Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Acked-by: Paolo Abeni Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-3-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- Documentation/networking/ip-sysctl.rst | 36 ++- .../networking/net_cachelines/tcp_sock.rst | 3 + include/linux/tcp.h | 8 +- include/net/tcp.h | 1 + include/net/tcp_ecn.h | 310 +++++++++++++++++++-- net/ipv4/syncookies.c | 4 + net/ipv4/sysctl_net_ipv4.c | 3 +- net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 50 +++- net/ipv4/tcp_ipv4.c | 6 +- net/ipv4/tcp_minisocks.c | 24 +- net/ipv4/tcp_output.c | 10 +- net/ipv6/syncookies.c | 2 + net/ipv6/tcp_ipv6.c | 1 + 14 files changed, 400 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 9f5891c9b07b..3d8eb54b84f9 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -443,20 +443,28 @@ tcp_early_retrans - INTEGER tcp_ecn - INTEGER Control use of Explicit Congestion Notification (ECN) by TCP. - ECN is used only when both ends of the TCP connection indicate - support for it. This feature is useful in avoiding losses due - to congestion by allowing supporting routers to signal - congestion before having to drop packets. - - Possible values are: - - = ===================================================== - 0 Disable ECN. Neither initiate nor accept ECN. - 1 Enable ECN when requested by incoming connections and - also request ECN on outgoing connection attempts. - 2 Enable ECN when requested by incoming connections - but do not request ECN on outgoing connections. - = ===================================================== + ECN is used only when both ends of the TCP connection indicate support + for it. This feature is useful in avoiding losses due to congestion by + allowing supporting routers to signal congestion before having to drop + packets. A host that supports ECN both sends ECN at the IP layer and + feeds back ECN at the TCP layer. The highest variant of ECN feedback + that both peers support is chosen by the ECN negotiation (Accurate ECN, + ECN, or no ECN). 
+ + The highest negotiated variant for incoming connection requests + and the highest variant requested by outgoing connection + attempts: + + ===== ==================== ==================== + Value Incoming connections Outgoing connections + ===== ==================== ==================== + 0 No ECN No ECN + 1 ECN ECN + 2 ECN No ECN + 3 AccECN AccECN + 4 AccECN ECN + 5 AccECN No ECN + ===== ==================== ==================== Default: 2 diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 31313a9adccc..4f71ece7c655 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -103,6 +103,9 @@ u32 delivered read_mostly read_w u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u32 received_ce read_mostly read_write u8:4 received_ce_pending read_mostly read_write +u8:2 syn_ect_snt write_mostly read_write +u8:2 syn_ect_rcv read_mostly read_write +u8:4 accecn_fail_mode u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u64 first_tx_mstamp read_write tcp_rate_skb_sent diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 90cee6e53527..b8432bed546d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -168,6 +168,10 @@ struct tcp_request_sock { * after data-in-SYN. */ u8 syn_tos; + bool accecn_ok; + u8 syn_ect_snt: 2, + syn_ect_rcv: 2, + accecn_fail_mode:4; #ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; @@ -375,7 +379,8 @@ struct tcp_sock { u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ - unused:5; + syn_ect_snt:2, /* AccECN ECT memory, only */ + syn_ect_rcv:2; /* ... needed during 3WHS + first seqno */ u8 thin_lto : 1,/* Use linear timeouts for thin streams */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ @@ -391,6 +396,7 @@ struct tcp_sock { syn_fastopen_child:1; /* created TFO passive child socket */ u8 keepalive_probes; /* num of allowed keep alive probes */ + u8 accecn_fail_mode:4; /* AccECN failure handling */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ /* RTT measurement */ diff --git a/include/net/tcp.h b/include/net/tcp.h index bc5159fe842e..da8c6640ead3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -972,6 +972,7 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) #define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) +#define TCPHDR_SYNACK_ACCECN (TCPHDR_SYN | TCPHDR_ACK | TCPHDR_CWR) #define TCP_ACCECN_CEP_ACE_MASK 0x7 #define TCP_ACCECN_ACE_MAX_DELTA 6 diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index b0ed89dbad41..da0b355418bd 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -4,12 +4,26 @@ #include #include +#include #include #include #include #include +/* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is + * attemped to be negotiated and requested for incoming connection + * and outgoing connection, respectively. + */ +enum tcp_ecn_mode { + TCP_ECN_IN_NOECN_OUT_NOECN = 0, + TCP_ECN_IN_ECN_OUT_ECN = 1, + TCP_ECN_IN_ECN_OUT_NOECN = 2, + TCP_ECN_IN_ACCECN_OUT_ACCECN = 3, + TCP_ECN_IN_ACCECN_OUT_ECN = 4, + TCP_ECN_IN_ACCECN_OUT_NOECN = 5, +}; + static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) { /* Do not set CWR if in AccECN mode! 
*/ @@ -39,19 +53,125 @@ static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; } +/* tp->accecn_fail_mode */ +#define TCP_ACCECN_ACE_FAIL_SEND BIT(0) +#define TCP_ACCECN_ACE_FAIL_RECV BIT(1) +#define TCP_ACCECN_OPT_FAIL_SEND BIT(2) +#define TCP_ACCECN_OPT_FAIL_RECV BIT(3) + +static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND; +} + +static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV; +} + +static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND; +} + +static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV; +} + +static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode) +{ + tp->accecn_fail_mode |= mode; +} + static inline u8 tcp_accecn_ace(const struct tcphdr *th) { return (th->ae << 2) | (th->cwr << 1) | th->ece; } -static inline void tcp_accecn_init_counters(struct tcp_sock *tp) +/* Infer the ECT value our SYN arrived with from the echoed ACE field */ +static inline int tcp_accecn_extract_syn_ect(u8 ace) { - tp->received_ce = 0; - tp->received_ce_pending = 0; + /* Below is an excerpt from the 1st block of Table 2 of AccECN spec */ + static const int ace_to_ecn[8] = { + INET_ECN_ECT_0, /* 0b000 (Undefined) */ + INET_ECN_ECT_1, /* 0b001 (Undefined) */ + INET_ECN_NOT_ECT, /* 0b010 (Not-ECT is received) */ + INET_ECN_ECT_1, /* 0b011 (ECT-1 is received) */ + INET_ECN_ECT_0, /* 0b100 (ECT-0 is received) */ + INET_ECN_ECT_1, /* 0b101 (Reserved) */ + INET_ECN_CE, /* 0b110 (CE is received) */ + INET_ECN_ECT_1 /* 0b111 (Undefined) */ + }; + + return ace_to_ecn[ace & 0x7]; +} + +/* Check ECN field transition to detect invalid transitions */ +static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv) +{ + if (rcv == snt) + return true; + + /* Non-ECT altered to something or something became non-ECT */ + if (snt == INET_ECN_NOT_ECT || rcv == INET_ECN_NOT_ECT) + return false; + /* CE -> ECT(0/1)? 
*/ + if (snt == INET_ECN_CE) + return false; + return true; +} + +static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, + u8 sent_ect) +{ + u8 ect = tcp_accecn_extract_syn_ect(ace); + struct tcp_sock *tp = tcp_sk(sk); + + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) + return true; + + if (!tcp_ect_transition_valid(sent_ect, ect)) { + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); + return false; + } + + return true; +} + +/* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */ +static inline void tcp_accecn_third_ack(struct sock *sk, + const struct sk_buff *skb, u8 sent_ect) +{ + u8 ace = tcp_accecn_ace(tcp_hdr(skb)); + struct tcp_sock *tp = tcp_sk(sk); + + switch (ace) { + case 0x0: + /* Invalid value */ + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); + break; + case 0x7: + case 0x5: + case 0x1: + /* Unused but legal values */ + break; + default: + /* Validation only applies to first non-data packet */ + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && + !TCP_SKB_CB(skb)->sacked && + tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) { + if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) && + !tp->delivered_ce) + tp->delivered_ce++; + } + break; + } } /* Updates Accurate ECN received counters from the received IP ECN field */ -static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb) +static inline void tcp_ecn_received_counters(struct sock *sk, + const struct sk_buff *skb) { u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; u8 is_ce = INET_ECN_is_ce(ecnfield); @@ -74,27 +194,152 @@ static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_bu } } -static inline void tcp_accecn_set_ace(struct tcphdr *th, struct tcp_sock *tp) +/* AccECN specification, 5.1: [...] a server can determine that it + * negotiated AccECN as [...] if the ACK contains an ACE field with + * the value 0b010 to 0b111 (decimal 2 to 7). + */ +static inline bool cookie_accecn_ok(const struct tcphdr *th) { - u32 wire_ace; + return tcp_accecn_ace(th) > 0x1; +} + +/* Used to form the ACE flags for SYN/ACK */ +static inline u16 tcp_accecn_reflector_flags(u8 ect) +{ + /* TCP ACE flags of SYN/ACK are set based on IP-ECN received from SYN. + * Below is an excerpt from the 1st block of Table 2 of AccECN spec, + * in which TCP ACE flags are encoded as: (AE << 2) | (CWR << 1) | ECE + */ + static const u8 ecn_to_ace_flags[4] = { + 0b010, /* Not-ECT is received */ + 0b011, /* ECT(1) is received */ + 0b100, /* ECT(0) is received */ + 0b110 /* CE is received */ + }; + + return FIELD_PREP(TCPHDR_ACE, ecn_to_ace_flags[ect & 0x3]); +} - wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; - th->ece = !!(wire_ace & 0x1); - th->cwr = !!(wire_ace & 0x2); - th->ae = !!(wire_ace & 0x4); +/* AccECN specification, 3.1.2: If a TCP server that implements AccECN + * receives a SYN with the three TCP header flags (AE, CWR and ECE) set + * to any combination other than 000, 011 or 111, it MUST negotiate the + * use of AccECN as if they had been set to 111. 
+ */ +static inline bool tcp_accecn_syn_requested(const struct tcphdr *th) +{ + u8 ace = tcp_accecn_ace(th); + + return ace && ace != 0x3; +} + +static inline void tcp_accecn_init_counters(struct tcp_sock *tp) +{ + tp->received_ce = 0; tp->received_ce_pending = 0; } -static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp, - const struct tcphdr *th) +/* Used for make_synack to form the ACE flags */ +static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect) { - if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) + /* TCP ACE flags of SYN/ACK are set based on IP-ECN codepoint received + * from SYN. Below is an excerpt from Table 2 of the AccECN spec: + * +====================+====================================+ + * | IP-ECN codepoint | Respective ACE flags on SYN/ACK | + * | received on SYN | AE CWR ECE | + * +====================+====================================+ + * | Not-ECT | 0 1 0 | + * | ECT(1) | 0 1 1 | + * | ECT(0) | 1 0 0 | + * | CE | 1 1 0 | + * +====================+====================================+ + */ + th->ae = !!(ect & INET_ECN_ECT_0); + th->cwr = ect != INET_ECN_ECT_0; + th->ece = ect == INET_ECN_ECT_1; +} + +static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb, + struct tcphdr *th) +{ + u32 wire_ace; + + /* The final packet of the 3WHS or anything like it must reflect + * the SYN/ACK ECT instead of putting CEP into ACE field, such + * cases show up in tcp_flags. + */ + if (likely(!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))) { + wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; + th->ece = !!(wire_ace & 0x1); + th->cwr = !!(wire_ace & 0x2); + th->ae = !!(wire_ace & 0x4); + tp->received_ce_pending = 0; + } +} + +/* See Table 2 of the AccECN draft */ +static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, + u8 ip_dsfield) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 ace = tcp_accecn_ace(th); + + switch (ace) { + case 0x0: + case 0x7: + /* +========+========+============+=============+ + * | A | B | SYN/ACK | Feedback | + * | | | B->A | Mode of A | + * | | | AE CWR ECE | | + * +========+========+============+=============+ + * | AccECN | No ECN | 0 0 0 | Not ECN | + * | AccECN | Broken | 1 1 1 | Not ECN | + * +========+========+============+=============+ + */ tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); + break; + case 0x1: + case 0x5: + /* +========+========+============+=============+ + * | A | B | SYN/ACK | Feedback | + * | | | B->A | Mode of A | + * | | | AE CWR ECE | | + * +========+========+============+=============+ + * | AccECN | Nonce | 1 0 1 | (Reserved) | + * | AccECN | ECN | 0 0 1 | Classic ECN | + * | Nonce | AccECN | 0 0 1 | Classic ECN | + * | ECN | AccECN | 0 0 1 | Classic ECN | + * +========+========+============+=============+ + */ + if (tcp_ecn_mode_pending(tp)) + /* Downgrade from AccECN, or requested initially */ + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + break; + default: + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; + if (INET_ECN_is_ce(ip_dsfield) && + tcp_accecn_validate_syn_feedback(sk, ace, + tp->syn_ect_snt)) { + tp->received_ce++; + tp->received_ce_pending++; + } + break; + } } -static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, - const struct tcphdr *th) +static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th, + const struct sk_buff *skb) { + if (tcp_ecn_mode_pending(tp)) { + if (!tcp_accecn_syn_requested(th)) { + /* Downgrade to classic ECN feedback */ + tcp_ecn_mode_set(tp,
TCP_ECN_MODE_RFC3168); + } else { + tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + } + } if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } @@ -110,7 +355,7 @@ static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, /* Packet ECN state for a SYN-ACK */ static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) { - const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; if (tcp_ecn_disabled(tp)) @@ -118,6 +363,13 @@ static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) else if (tcp_ca_needs_ecn(sk) || tcp_bpf_ca_needs_ecn(sk)) INET_ECN_xmit(sk); + + if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) { + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; + TCP_SKB_CB(skb)->tcp_flags |= + tcp_accecn_reflector_flags(tp->syn_ect_rcv); + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; + } } /* Packet ECN state for a SYN. */ @@ -125,8 +377,13 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); - bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || - tcp_ca_needs_ecn(sk) || bpf_needs_ecn; + bool use_ecn, use_accecn; + u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn); + + use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN; + use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN || + tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn; if (!use_ecn) { const struct dst_entry *dst = __sk_dst_get(sk); @@ -142,23 +399,32 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) INET_ECN_xmit(sk); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; - tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + if (use_accecn) { + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING); + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; + } else { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + } } } static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) { - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) { /* tp->ecn_flags are cleared at a later point in time when * SYN ACK is ultimatively being received. 
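 * Clearing TCPHDR_ACE (the AE, CWR and ECE bits) below covers both the
 * AccECN and the RFC3168 SYN signalling bits in one mask.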
*/ - TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; + } } static inline void tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) { - if (inet_rsk(req)->ecn_ok) + if (tcp_rsk(req)->accecn_ok) + tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv); + else if (inet_rsk(req)->ecn_ok) th->ece = 1; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index eb0819463fae..569befcf021b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -12,6 +12,7 @@ #include #include #include +#include #include static siphash_aligned_key_t syncookie_secret[2]; @@ -403,6 +404,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); struct inet_request_sock *ireq; struct net *net = sock_net(sk); + struct tcp_request_sock *treq; struct request_sock *req; struct sock *ret = sk; struct flowi4 fl4; @@ -428,6 +430,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) } ireq = inet_rsk(req); + treq = tcp_rsk(req); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); @@ -483,6 +486,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst); + treq->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3a43010d726f..268f8b86e8a7 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -47,6 +47,7 @@ static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC; +static int tcp_ecn_mode_max = 2; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -728,7 +729,7 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, + .extra2 = &tcp_ecn_mode_max, }, { .procname = "tcp_ecn_fallback", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 16456c10e5e8..7261ee6dd875 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3407,6 +3407,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; + tp->accecn_fail_mode = 0; tcp_accecn_init_counters(tp); if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 98782134c2f4..8449a5a3e368 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3665,8 +3665,18 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); } +static void tcp_send_ack_reflect_ect(struct sock *sk, bool accecn_reflector) +{ + struct tcp_sock *tp = tcp_sk(sk); + u16 flags = 0; + + if (accecn_reflector) + flags = tcp_accecn_reflector_flags(tp->syn_ect_rcv); + __tcp_send_ack(sk, tp->rcv_nxt, flags); +} + /* RFC 5961 7 [ACK Throttling] */ -static void tcp_send_challenge_ack(struct sock *sk) +static void tcp_send_challenge_ack(struct sock *sk, bool accecn_reflector) { struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); @@ -3696,7 +3706,7 @@ static void tcp_send_challenge_ack(struct sock *sk) 
WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1); send_ack: NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); - tcp_send_ack(sk); + tcp_send_ack_reflect_ect(sk, accecn_reflector); } } @@ -3863,7 +3873,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ if (before(ack, prior_snd_una - max_window)) { if (!(flag & FLAG_NO_CHALLENGE_ACK)) - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); return -SKB_DROP_REASON_TCP_TOO_OLD_ACK; } goto old_ack; @@ -5907,6 +5917,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, int syn_inerr) { struct tcp_sock *tp = tcp_sk(sk); + bool accecn_reflector = false; SKB_DR(reason); /* RFC1323: H1. Apply PAWS check first. */ @@ -6004,7 +6015,7 @@ step1: if (tp->syn_fastopen && !tp->data_segs_in && sk->sk_state == TCP_ESTABLISHED) tcp_fastopen_active_disable(sk); - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); SKB_DR_SET(reason, TCP_RESET); goto discard; } @@ -6015,6 +6026,8 @@ step1: * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { + if (tcp_ecn_mode_accecn(tp)) + accecn_reflector = true; if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt && @@ -6024,7 +6037,7 @@ syn_challenge: if (syn_inerr) TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, accecn_reflector); SKB_DR_SET(reason, TCP_INVALID_SYN); goto discard; } @@ -6493,7 +6506,8 @@ consume: * state to ESTABLISHED..." */ - tcp_ecn_rcv_synack(tp, th); + if (tcp_ecn_mode_any(tp)) + tcp_ecn_rcv_synack(sk, th, TCP_SKB_CB(skb)->ip_dsfield); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_try_undo_spurious_syn(sk); @@ -6565,7 +6579,7 @@ consume: TCP_DELACK_MAX, false); goto consume; } - tcp_send_ack(sk); + tcp_send_ack_reflect_ect(sk, tcp_ecn_mode_accecn(tp)); return -1; } @@ -6624,7 +6638,7 @@ consume: tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->max_window = tp->snd_wnd; - tcp_ecn_rcv_syn(tp, th); + tcp_ecn_rcv_syn(tp, th, skb); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -6806,7 +6820,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } /* accept old ack during closing */ if ((int)reason < 0) { - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); reason = -reason; goto discard; } @@ -6853,9 +6867,12 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->lsndtime = tcp_jiffies32; tcp_initialize_rcv_mss(sk); + if (tcp_ecn_mode_accecn(tp)) + tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt); tcp_fast_path_on(tp); if (sk->sk_shutdown & SEND_SHUTDOWN) tcp_shutdown(sk, SEND_SHUTDOWN); + break; case TCP_FIN_WAIT1: { @@ -7025,6 +7042,15 @@ static void tcp_ecn_create_request(struct request_sock *req, bool ect, ecn_ok; u32 ecn_ok_dst; + if (tcp_accecn_syn_requested(th) && + READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) { + inet_rsk(req)->ecn_ok = 1; + tcp_rsk(req)->accecn_ok = 1; + tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; + return; + } + if (!th_ecn) return; @@ -7032,7 +7058,8 @@ static void tcp_ecn_create_request(struct request_sock *req, ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst; - if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || + if (((!ect || th->res1 || th->ae) && 
ecn_ok) || + tcp_ca_needs_ecn(listen_sk) || (ecn_ok_dst & DST_FEATURE_ECN_CA) || tcp_bpf_ca_needs_ecn((struct sock *)req)) inet_rsk(req)->ecn_ok = 1; @@ -7050,6 +7077,9 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->snt_synack = 0; tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; + tcp_rsk(req)->accecn_ok = 0; + tcp_rsk(req)->syn_ect_rcv = 0; + tcp_rsk(req)->syn_ect_snt = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; ireq->tstamp_ok = rx_opt->tstamp_ok; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2a0602035729..6162f8dbe9d2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include #include @@ -1189,7 +1190,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { - const struct inet_request_sock *ireq = inet_rsk(req); + struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; int err = -1; struct sk_buff *skb; @@ -1202,6 +1203,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { + tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); tos = READ_ONCE(inet_sk(sk)->tos); @@ -3558,7 +3560,7 @@ fallback: static int __net_init tcp_sk_init(struct net *net) { - net->ipv4.sysctl_tcp_ecn = 2; + net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7c2ae07d8d5d..a4b8be6fdcdc 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -20,6 +20,7 @@ */ #include +#include #include #include #include @@ -451,12 +452,23 @@ void tcp_openreq_init_rwin(struct request_sock *req, ireq->rcv_wscale = rcv_wscale; } -static void tcp_ecn_openreq_child(struct tcp_sock *tp, - const struct request_sock *req) +static void tcp_ecn_openreq_child(struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb) { - tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? - TCP_ECN_MODE_RFC3168 : - TCP_ECN_DISABLED); + const struct tcp_request_sock *treq = tcp_rsk(req); + struct tcp_sock *tp = tcp_sk(sk); + + if (treq->accecn_ok) { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_snt = treq->syn_ect_snt; + tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tcp_ecn_received_counters(sk, skb); + } else { + tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? 
+ TCP_ECN_MODE_RFC3168 : + TCP_ECN_DISABLED); + } } void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) @@ -621,7 +633,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; - tcp_ecn_openreq_child(newtp, req); + tcp_ecn_openreq_child(newsk, req, skb); newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a3a6d3e91d84..deb9b085a8a2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -332,8 +332,9 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, return; if (tcp_ecn_mode_accecn(tp)) { - INET_ECN_xmit(sk); - tcp_accecn_set_ace(th, tp); + if (!tcp_accecn_ace_fail_recv(tp)) + INET_ECN_xmit(sk); + tcp_accecn_set_ace(tp, skb, th); skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN; } else { /* Not-retransmitted data segment: set ECT and inject CWR. */ @@ -3356,7 +3357,10 @@ start: tcp_retrans_try_collapse(sk, skb, avail_wnd); } - /* RFC3168, section 6.1.1.1. ECN fallback */ + /* RFC3168, section 6.1.1.1. ECN fallback + * As AccECN uses the same SYN flags (+ AE), this check covers both + * cases. + */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) tcp_ecn_clear_syn(sk, skb); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index f0ee1a909771..7e007f013ec8 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -16,6 +16,7 @@ #include #include #include +#include #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) @@ -264,6 +265,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, dst); + tcp_rsk(req)->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); ret = tcp_get_cookie_sock(sk, skb, req, dst); if (!ret) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 08dabc47a6e7..5f0a138f4220 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -544,6 +544,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { + tcp_rsk(req)->syn_ect_snt = np->tclass & INET_ECN_MASK; __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); -- cgit v1.2.3 From 9a011277445583bab002fbf5043fab0ea03dc5dd Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:27 +0200 Subject: tcp: accecn: add AccECN rx byte counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These three byte counters track IP ECN field payload byte sums for all arriving (acceptable) packets for ECT0, ECT1, and CE. The AccECN option (added by a later patch in the series) echoes these counters back to sender side; therefore, it is placed within the group of tcp_sock_write_txrx. Below are the pahole outcomes before and after this patch, in which the group size of tcp_sock_write_txrx is increased from 95 + 4 to 107 + 4 and an extra 4-byte hole is created but will be exploited in later patches: [BEFORE THIS PATCH] struct tcp_sock { [...] u32 delivered_ce; /* 2576 4 */ u32 received_ce; /* 2580 4 */ u32 app_limited; /* 2584 4 */ u32 rcv_wnd; /* 2588 4 */ struct tcp_options_received rx_opt; /* 2592 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2616 0 */ [...] 
/* size: 3200, cachelines: 50, members: 166 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u32 delivered_ce; /* 2576 4 */ u32 received_ce; /* 2580 4 */ u32 received_ecn_bytes[3];/* 2584 12 */ u32 app_limited; /* 2596 4 */ u32 rcv_wnd; /* 2600 4 */ struct tcp_options_received rx_opt; /* 2604 24 */ __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2628 0 */ /* XXX 4 bytes hole, try to pack */ [...] /* size: 3200, cachelines: 50, members: 167 */ } Signed-off-by: Ilpo Järvinen Signed-off-by: Neal Cardwell Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-4-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- .../networking/net_cachelines/tcp_sock.rst | 1 + include/linux/tcp.h | 4 +++ include/net/tcp_ecn.h | 29 +++++++++++++++++++++- net/ipv4/tcp.c | 3 ++- net/ipv4/tcp_input.c | 7 +++--- net/ipv4/tcp_minisocks.c | 2 +- 6 files changed, 40 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 4f71ece7c655..5a2b0af57364 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -102,6 +102,7 @@ u32 prr_out read_mostly read_m u32 delivered read_mostly read_write tcp_rate_skb_sent, tcp_newly_delivered(tx);tcp_ack, tcp_rate_gen, tcp_clean_rtx_queue (rx) u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) u32 received_ce read_mostly read_write +u32[3] received_ecn_bytes read_mostly read_write u8:4 received_ce_pending read_mostly read_write u8:2 syn_ect_snt write_mostly read_write u8:2 syn_ect_rcv read_mostly read_write diff --git a/include/linux/tcp.h b/include/linux/tcp.h index b8432bed546d..012d01347b3c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -306,6 +306,10 @@ struct tcp_sock { u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 received_ce; /* Like the above but for rcvd CE marked pkts */ + u32 received_ecn_bytes[3]; /* received byte counters for three ECN + * types: INET_ECN_ECT_1, INET_ECN_ECT_0, + * and INET_ECN_CE + */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ /* diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index da0b355418bd..1a41a459aa07 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -171,7 +171,7 @@ static inline void tcp_accecn_third_ack(struct sock *sk, /* Updates Accurate ECN received counters from the received IP ECN field */ static inline void tcp_ecn_received_counters(struct sock *sk, - const struct sk_buff *skb) + const struct sk_buff *skb, u32 len) { u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; u8 is_ce = INET_ECN_is_ce(ecnfield); @@ -191,9 +191,24 @@ static inline void tcp_ecn_received_counters(struct sock *sk, tp->received_ce += pcount; tp->received_ce_pending = min(tp->received_ce_pending + pcount, 0xfU); + + if (len > 0) + tp->received_ecn_bytes[ecnfield - 1] += len; } } +/* AccECN specification, 2.2: [...] A Data Receiver maintains four counters + * initialized at the start of the half-connection. [...] These byte counters + * reflect only the TCP payload length, excluding TCP header and TCP options. 
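+ * (Illustrative: with th->doff * 4 == 32, a 1480-byte segment contributes
+ * 1480 - 32 = 1448 payload bytes to received_ecn_bytes[], as computed by
+ * tcp_ecn_received_counters_payload() below.)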
+ */ +static inline void tcp_ecn_received_counters_payload(struct sock *sk, + const struct sk_buff *skb) +{ + const struct tcphdr *th = (const struct tcphdr *)skb->data; + + tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); +} + /* AccECN specification, 5.1: [...] a server can determine that it * negotiated AccECN as [...] if the ACK contains an ACE field with * the value 0b010 to 0b111 (decimal 2 to 7). @@ -232,10 +247,22 @@ static inline bool tcp_accecn_syn_requested(const struct tcphdr *th) return ace && ace != 0x3; } +static inline void __tcp_accecn_init_bytes_counters(int *counter_array) +{ + BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1); + BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2); + BUILD_BUG_ON(INET_ECN_CE != 0x3); + + counter_array[INET_ECN_ECT_1 - 1] = 0; + counter_array[INET_ECN_ECT_0 - 1] = 0; + counter_array[INET_ECN_CE - 1] = 0; +} + static inline void tcp_accecn_init_counters(struct tcp_sock *tp) { tp->received_ce = 0; tp->received_ce_pending = 0; + __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); } /* Used for make_synack to form the ACE flags */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7261ee6dd875..a45a4184b603 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5142,6 +5142,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); @@ -5149,7 +5150,7 @@ static void __init tcp_struct_check(void) /* 32bit arches with 8byte alignment on u64 fields might need padding * before tcp_clock_cache. */ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 95 + 4); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 107 + 4); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8449a5a3e368..636a63383412 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6163,7 +6163,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) flag |= __tcp_replace_ts_recent(tp, delta); - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters(sk, skb, 0); /* We know that such packets are checksummed * on entry. 
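 * (The 0 length passed above is deliberate: a header-only predicted ACK
 * carries no payload bytes to add to received_ecn_bytes.)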
@@ -6213,7 +6213,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) /* Bulk data transfer: receiver */ tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters(sk, skb, + len - tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); @@ -6254,7 +6255,7 @@ validate: return; step5: - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters_payload(sk, skb); reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); if ((int)reason < 0) { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index a4b8be6fdcdc..1dbcc09ff7a9 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -463,7 +463,7 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); - tcp_ecn_received_counters(sk, skb); + tcp_ecn_received_counters_payload(sk, skb); } else { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? TCP_ECN_MODE_RFC3168 : -- cgit v1.2.3 From b5e74132dfbe60329b3ff0e5c485039f2e31605c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:30 +0200 Subject: tcp: accecn: AccECN option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Accurate ECN allows echoing back the sum of bytes for each IP ECN field value in the received packets using AccECN option. This change implements AccECN option tx & rx side processing without option send control related features that are added by a later change. Based on specification: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt (Some features of the spec will be added in the later changes rather than in this one). A full-length AccECN option is always attempted but if it does not fit, the minimum length is selected based on the counters that have changed since the last update. The AccECN option (with 24-bit fields) often ends in odd sizes so the option write code tries to take advantage of some nop used to pad the other TCP options. The delivered_ecn_bytes pairs with received_ecn_bytes similar to how delivered_ce pairs with received_ce. In contrast to ACE field, however, the option is not always available to update delivered_ecn_bytes. For ACK w/o AccECN option, the delivered bytes calculated based on the cumulative ACK+SACK information are assigned to one of the counters using an estimation heuristic to select the most likely ECN byte counter. Any estimation error is corrected when the next AccECN option arrives. It may occur that the heuristic gets too confused when there are enough different byte counter deltas between ACKs with the AccECN option in which case the heuristic just gives up on updating the counters for a while. tcp_ecn_option sysctl can be used to select option sending mode for AccECN: TCP_ACCECN_OPTION_DISABLED, TCP_ACCECN_OPTION_MINIMUM, and TCP_ACCECN_OPTION_FULL. This patch increases the size of the tcp_info struct, as there are no existing holes for new u32 variables. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_info { [...] __u32 tcpi_total_rto_time; /* 244 4 */ /* size: 248, cachelines: 4, members: 61 */ } [AFTER THIS PATCH] struct tcp_info { [...]
__u32 tcpi_total_rto_time; /* 244 4 */ __u32 tcpi_received_ce; /* 248 4 */ __u32 tcpi_delivered_e1_bytes; /* 252 4 */ __u32 tcpi_delivered_e0_bytes; /* 256 4 */ __u32 tcpi_delivered_ce_bytes; /* 260 4 */ __u32 tcpi_received_e1_bytes; /* 264 4 */ __u32 tcpi_received_e0_bytes; /* 268 4 */ __u32 tcpi_received_ce_bytes; /* 272 4 */ /* size: 280, cachelines: 5, members: 68 */ } This patch uses the existing 1-byte holes in the tcp_sock_write_txrx group for new u8 members, but adds a 4-byte hole in tcp_sock_write_rx group after the new u32 delivered_ecn_bytes[3] member. Therefore, the group size of tcp_sock_write_rx is increased from 96 to 112. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_sock { [...] u8 received_ce_pending:4; /* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ /* XXX 1 byte hole, try to pack */ [...] u32 rcv_rtt_last_tsecr; /* 2668 4 */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2728 0 */ [...] /* size: 3200, cachelines: 50, members: 167 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ u8 accecn_minlen:2; /* 2523: 0 1 */ u8 est_ecnfield:2; /* 2523: 2 1 */ u8 unused3:4; /* 2523: 4 1 */ [...] u32 rcv_rtt_last_tsecr; /* 2668 4 */ u32 delivered_ecn_bytes[3];/* 2672 12 */ /* XXX 4 bytes hole, try to pack */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2744 0 */ [...] /* size: 3200, cachelines: 50, members: 171 */ } Signed-off-by: Ilpo Järvinen Signed-off-by: Neal Cardwell Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-7-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- Documentation/networking/ip-sysctl.rst | 19 +++ .../networking/net_cachelines/tcp_sock.rst | 3 + include/linux/tcp.h | 9 +- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 13 ++ include/net/tcp_ecn.h | 89 ++++++++++- include/uapi/linux/tcp.h | 7 + net/ipv4/sysctl_net_ipv4.c | 9 ++ net/ipv4/tcp.c | 15 +- net/ipv4/tcp_input.c | 94 +++++++++++- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 165 ++++++++++++++++++++- 12 files changed, 412 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 3d8eb54b84f9..1c206501b973 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -468,6 +468,25 @@ tcp_ecn - INTEGER Default: 2 +tcp_ecn_option - INTEGER + Control Accurate ECN (AccECN) option sending when AccECN has been + successfully negotiated during handshake. Send logic inhibits + sending AccECN options regardless of this setting when no AccECN + option has been seen for the reverse direction. + + Possible values are: + + = ============================================================ + 0 Never send AccECN option. This also disables sending AccECN + option in SYN/ACK during handshake. + 1 Send AccECN option sparingly according to the minimum option + rules outlined in draft-ietf-tcpm-accurate-ecn. + 2 Send AccECN option on every packet whenever it fits into TCP + option space. + = ============================================================ + + Default: 2 + tcp_ecn_fallback - BOOLEAN If the kernel detects that ECN connection misbehaves, enable fall back to non-ECN.
Currently, this knob implements the fallback diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 5a2b0af57364..b941151f8c0a 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -104,8 +104,11 @@ u32 delivered_ce read_mostly read_w u32 received_ce read_mostly read_write u32[3] received_ecn_bytes read_mostly read_write u8:4 received_ce_pending read_mostly read_write +u32[3] delivered_ecn_bytes read_write u8:2 syn_ect_snt write_mostly read_write u8:2 syn_ect_rcv read_mostly read_write +u8:2 accecn_minlen write_mostly read_write +u8:2 est_ecnfield read_write u8:4 accecn_fail_mode u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 012d01347b3c..73557656cb2d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -122,8 +122,9 @@ struct tcp_options_received { smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ - u8 saw_unknown:1, /* Received unknown option */ - unused:7; + u8 accecn:6, /* AccECN index in header, 0=no options */ + saw_unknown:1, /* Received unknown option */ + unused:1; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ @@ -293,6 +294,9 @@ struct tcp_sock { rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ unused2:4; + u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ + est_ecnfield:2,/* ECN field for AccECN delivered estimates */ + unused3:4; __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ @@ -337,6 +341,7 @@ struct tcp_sock { u32 rate_delivered; /* saved rate sample: packets delivered */ u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_rtt_last_tsecr; + u32 delivered_ecn_bytes[3]; u64 first_tx_mstamp; /* start of window send phase */ u64 delivered_mstamp; /* time we reached "delivered" */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 54a7d187f62a..acbb7dd497e1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -148,6 +148,7 @@ struct netns_ipv4 { struct local_ports ip_local_ports; u8 sysctl_tcp_ecn; + u8 sysctl_tcp_ecn_option; u8 sysctl_tcp_ecn_fallback; u8 sysctl_ip_default_ttl; diff --git a/include/net/tcp.h b/include/net/tcp.h index da8c6640ead3..6be29129465e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -213,6 +213,8 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOPT_AO 29 /* Authentication Option (RFC5925) */ #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ +#define TCPOPT_ACCECN0 172 /* 0xAC: Accurate ECN Order 0 */ +#define TCPOPT_ACCECN1 174 /* 0xAE: Accurate ECN Order 1 */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. 
See draft-ietf-tcpm-experimental-options-00.txt @@ -230,6 +232,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 +#define TCPOLEN_ACCECN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 #define TCPOLEN_EXP_SMC_BASE 6 @@ -243,6 +246,13 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 +#define TCPOLEN_ACCECN_PERFIELD 3 + +/* Maximum number of byte counters in AccECN option + size */ +#define TCP_ACCECN_NUMFIELDS 3 +#define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \ + TCPOLEN_ACCECN_PERFIELD * \ + TCP_ACCECN_NUMFIELDS) /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ @@ -981,6 +991,9 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) * See draft-ietf-tcpm-accurate-ecn for the latest values. */ #define TCP_ACCECN_CEP_INIT_OFFSET 5 +#define TCP_ACCECN_E1B_INIT_OFFSET 1 +#define TCP_ACCECN_E0B_INIT_OFFSET 1 +#define TCP_ACCECN_CEB_INIT_OFFSET 0 /* State flags for sacked in struct tcp_skb_cb */ enum tcp_skb_cb_sacked_flags { diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index 1a41a459aa07..08c7f4757e4e 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -24,6 +24,13 @@ enum tcp_ecn_mode { TCP_ECN_IN_ACCECN_OUT_NOECN = 5, }; +/* AccECN option sending when AccECN has been successfully negotiated */ +enum tcp_accecn_option { + TCP_ACCECN_OPTION_DISABLED = 0, + TCP_ACCECN_OPTION_MINIMUM = 1, + TCP_ACCECN_OPTION_FULL = 2, +}; + static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) { /* Do not set CWR if in AccECN mode! */ @@ -169,6 +176,79 @@ static inline void tcp_accecn_third_ack(struct sock *sk, } } +/* Maps IP ECN field ECT/CE code point to AccECN option field number, given + * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). + */ +static inline u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) +{ + switch (ecnfield & INET_ECN_MASK) { + case INET_ECN_NOT_ECT: + return 0; /* AccECN does not send counts of NOT_ECT */ + case INET_ECN_ECT_1: + return 1; + case INET_ECN_CE: + return 2; + case INET_ECN_ECT_0: + return 3; + } + return 0; +} + +/* Maps IP ECN field ECT/CE code point to AccECN option field value offset. + * Some fields do not start from zero, to detect zeroing by middleboxes. + */ +static inline u32 tcp_accecn_field_init_offset(u8 ecnfield) +{ + switch (ecnfield & INET_ECN_MASK) { + case INET_ECN_NOT_ECT: + return 0; /* AccECN does not send counts of NOT_ECT */ + case INET_ECN_ECT_1: + return TCP_ACCECN_E1B_INIT_OFFSET; + case INET_ECN_CE: + return TCP_ACCECN_CEB_INIT_OFFSET; + case INET_ECN_ECT_0: + return TCP_ACCECN_E0B_INIT_OFFSET; + } + return 0; +} + +/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ +static inline unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int option, + bool order) +{ + /* Based on Table 5 of the AccECN spec to map (option, order) to + * the corresponding ECN conuters (ECT-1, ECT-0, or CE). + */ + static const u8 optfield_lookup[2][3] = { + /* order = 0: 1st field ECT-0, 2nd field CE, 3rd field ECT-1 */ + { INET_ECN_ECT_0, INET_ECN_CE, INET_ECN_ECT_1 }, + /* order = 1: 1st field ECT-1, 2nd field CE, 3rd field ECT-0 */ + { INET_ECN_ECT_1, INET_ECN_CE, INET_ECN_ECT_0 } + }; + + return optfield_lookup[order][option % 3]; +} + +/* Handles AccECN option ECT and CE 24-bit byte counters update into + * the u32 value in tcp_sock. 
As we're processing TCP options, it is + * safe to access from - 1. + */ +static inline s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, + u32 init_offset) +{ + u32 truncated = (get_unaligned_be32(from - 1) - init_offset) & + 0xFFFFFFU; + u32 delta = (truncated - *cnt) & 0xFFFFFFU; + + /* If delta has the highest bit set (24th bit) indicating + * negative, sign extend to correct an estimation using + * sign_extend32(delta, 24 - 1) + */ + delta = sign_extend32(delta, 23); + *cnt += delta; + return (s32)delta; +} + /* Updates Accurate ECN received counters from the received IP ECN field */ static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb, u32 len) @@ -192,8 +272,12 @@ static inline void tcp_ecn_received_counters(struct sock *sk, tp->received_ce_pending = min(tp->received_ce_pending + pcount, 0xfU); - if (len > 0) + if (len > 0) { + u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); tp->received_ecn_bytes[ecnfield - 1] += len; + tp->accecn_minlen = max_t(u8, tp->accecn_minlen, + minlen); + } } } @@ -263,6 +347,9 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp) tp->received_ce = 0; tp->received_ce_pending = 0; __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); + __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes); + tp->accecn_minlen = 0; + tp->est_ecnfield = 0; } /* Used for make_synack to form the ACE flags */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index bdac8c42fa82..53e0e85b52be 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -316,6 +316,13 @@ struct tcp_info { * in milliseconds, including any * unfinished recovery. */ + __u32 tcpi_received_ce; /* # of CE marks received */ + __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */ + __u32 tcpi_delivered_e0_bytes; + __u32 tcpi_delivered_ce_bytes; + __u32 tcpi_received_e1_bytes; + __u32 tcpi_received_e0_bytes; + __u32 tcpi_received_ce_bytes; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 268f8b86e8a7..4a697acb4e85 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -731,6 +731,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &tcp_ecn_mode_max, }, + { + .procname = "tcp_ecn_option", + .data = &init_net.ipv4.sysctl_tcp_ecn_option, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a45a4184b603..8c4a4b8666fc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include #include +#include #include #include #include @@ -4155,6 +4156,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ce_idx = INET_ECN_CE - 1; unsigned long rate; u32 now; u64 rate64; @@ -4281,6 +4285,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + info->tcpi_received_ce = tp->received_ce; + info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; + info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; + 
info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx]; + info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx]; + info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx]; + info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx]; + unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -5162,12 +5174,13 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 96); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 112); } void __init tcp_init(void) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8b48a3c00945..e898a76c485e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include #include @@ -384,6 +385,73 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) } } +/* Returns true if the byte counters can be used */ +static bool tcp_accecn_process_option(struct tcp_sock *tp, + const struct sk_buff *skb, + u32 delivered_bytes, int flag) +{ + u8 estimate_ecnfield = tp->est_ecnfield; + bool ambiguous_ecn_bytes_incr = false; + bool first_changed = false; + unsigned int optlen; + bool order1, res; + unsigned int i; + u8 *ptr; + + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { + if (estimate_ecnfield) { + u8 ecnfield = estimate_ecnfield - 1; + + tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; + return true; + } + return false; + } + + ptr = skb_transport_header(skb) + tp->rx_opt.accecn; + optlen = ptr[1] - 2; + if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) + return false; + order1 = (ptr[0] == TCPOPT_ACCECN1); + ptr += 2; + + res = !!estimate_ecnfield; + for (i = 0; i < 3; i++) { + u32 init_offset; + u8 ecnfield; + s32 delta; + u32 *cnt; + + if (optlen < TCPOLEN_ACCECN_PERFIELD) + break; + + ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); + init_offset = tcp_accecn_field_init_offset(ecnfield); + cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; + delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); + if (delta && delta < 0) { + res = false; + ambiguous_ecn_bytes_incr = true; + } + if (delta && ecnfield != estimate_ecnfield) { + if (!first_changed) { + tp->est_ecnfield = ecnfield; + first_changed = true; + } else { + res = false; + ambiguous_ecn_bytes_incr = true; + } + } + + optlen -= TCPOLEN_ACCECN_PERFIELD; + ptr += TCPOLEN_ACCECN_PERFIELD; + } + if (ambiguous_ecn_bytes_incr) + tp->est_ecnfield = 0; + + return res; +} + static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) { tp->delivered_ce += ecn_count; @@ -400,7 +468,8 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, /* Returns the ECN CE delta */ static u32 __tcp_accecn_process(struct sock *sk, const 
struct sk_buff *skb, - u32 delivered_pkts, int flag) + u32 delivered_pkts, u32 delivered_bytes, + int flag) { const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); @@ -411,6 +480,8 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) return 0; + tcp_accecn_process_option(tp, skb, delivered_bytes, flag); + if (!(flag & FLAG_SLOWPATH)) { /* AccECN counter might overflow on large ACKs */ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) @@ -436,12 +507,14 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, } static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, - u32 delivered_pkts, int *flag) + u32 delivered_pkts, u32 delivered_bytes, + int *flag) { struct tcp_sock *tp = tcp_sk(sk); u32 delta; - delta = __tcp_accecn_process(sk, skb, delivered_pkts, *flag); + delta = __tcp_accecn_process(sk, skb, delivered_pkts, + delivered_bytes, *flag); if (delta > 0) { tcp_count_delivered_ce(tp, delta); *flag |= FLAG_ECE; @@ -3973,6 +4046,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, + sack_state.delivered_bytes, &flag); tcp_in_ack_event(sk, flag); @@ -4012,6 +4086,7 @@ no_queue: if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, + sack_state.delivered_bytes, &flag); tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ @@ -4139,6 +4214,7 @@ void tcp_parse_options(const struct net *net, ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; + opt_rx->accecn = 0; opt_rx->saw_unknown = 0; while (length > 0) { @@ -4230,6 +4306,12 @@ void tcp_parse_options(const struct net *net, ptr, th->syn, foc, false); break; + case TCPOPT_ACCECN0: + case TCPOPT_ACCECN1: + /* Save offset of AccECN option in TCP header */ + opt_rx->accecn = (ptr - 2) - (__u8 *)th; + break; + case TCPOPT_EXP: /* Fast Open option shares code 254 using a * 16 bits magic number. 
@@ -4290,11 +4372,14 @@ static bool tcp_fast_parse_options(const struct net *net, */ if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; return false; } else if (tp->rx_opt.tstamp_ok && th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { - if (tcp_parse_aligned_timestamp(tp, th)) + if (tcp_parse_aligned_timestamp(tp, th)) { + tp->rx_opt.accecn = 0; return true; + } } tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); @@ -6119,6 +6204,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) */ tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_prediction is to be made diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6162f8dbe9d2..aa8dbfe20924 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3561,6 +3561,7 @@ fallback: static int __net_init tcp_sk_init(struct net *net) { net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; + net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5be2b3eb73d3..34e5c83bbace 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -385,6 +385,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_SMC BIT(9) #define OPTION_MPTCP BIT(10) #define OPTION_AO BIT(11) +#define OPTION_ACCECN BIT(12) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -406,6 +407,8 @@ struct tcp_out_options { u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ + u8 num_accecn_fields:7, /* number of AccECN fields needed */ + use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */ u8 hash_size; /* bytes in hash_location */ u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ @@ -603,6 +606,11 @@ static __be32 *process_tcp_ao_options(struct tcp_sock *tp, return ptr; } +/* Initial values for AccECN option, order is based on ECN field bits + * similar to received_ecn_bytes. Used for SYN/ACK AccECN option. + */ +static const u32 synack_ecn_bytes[3] = { 0, 0, 0 }; + /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -621,6 +629,8 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, struct tcp_out_options *opts, struct tcp_key *key) { + u8 leftover_highbyte = TCPOPT_NOP; /* replace 1st NOP if avail */ + u8 leftover_lowbyte = TCPOPT_NOP; /* replace 2nd NOP in succession */ __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */ @@ -656,15 +666,71 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } + if (OPTION_ACCECN & options) { + const u32 *ecn_bytes = opts->use_synack_ecn_bytes ?
+ synack_ecn_bytes : + tp->received_ecn_bytes; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ce_idx = INET_ECN_CE - 1; + u32 e0b; + u32 e1b; + u32 ceb; + u8 len; + + e0b = ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET; + e1b = ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET; + ceb = ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET; + len = TCPOLEN_ACCECN_BASE + + opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD; + + if (opts->num_accecn_fields == 2) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + } else if (opts->num_accecn_fields == 1) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + leftover_highbyte = e1b & 0xff; + leftover_lowbyte = TCPOPT_NOP; + } else if (opts->num_accecn_fields == 0) { + leftover_highbyte = TCPOPT_ACCECN1; + leftover_lowbyte = len; + } else if (opts->num_accecn_fields == 3) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + *ptr++ = htonl(((e0b & 0xffffff) << 8) | + TCPOPT_NOP); + } + if (tp) + tp->accecn_minlen = 0; + } + if (unlikely(OPTION_SACK_ADVERTISE & options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_WSCALE & options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | + u8 highbyte = TCPOPT_NOP; + + /* Do not split the leftover 2-byte to fit into a single + * NOP, i.e., replace this NOP only when 1 byte is leftover + * within leftover_highbyte. + */ + if (unlikely(leftover_highbyte != TCPOPT_NOP && + leftover_lowbyte == TCPOPT_NOP)) { + highbyte = leftover_highbyte; + leftover_highbyte = TCPOPT_NOP; + } + *ptr++ = htonl((highbyte << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->ws); @@ -675,11 +741,13 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, tp->duplicate_sack : tp->selective_acks; int this_sack; - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | (TCPOPT_SACK << 8) | (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK))); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; for (this_sack = 0; this_sack < opts->num_sack_blocks; ++this_sack) { @@ -688,6 +756,14 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, } tp->rx_opt.dsack = 0; + } else if (unlikely(leftover_highbyte != TCPOPT_NOP || + leftover_lowbyte != TCPOPT_NOP)) { + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | + (TCPOPT_NOP << 8) | + TCPOPT_NOP); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { @@ -768,6 +844,61 @@ static void mptcp_set_option_cond(const struct request_sock *req, } } +static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts) +{ + /* How much there's room for combining with the alignment padding? */ + if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) == + OPTION_SACK_ADVERTISE) + return 2; + else if (opts->options & OPTION_WSCALE) + return 1; + return 0; +} + +/* Calculates how long AccECN option will fit to @remaining option space. 
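+ * (Illustrative sizing: the full option is TCP_ACCECN_MAXSIZE = 2 + 3 * 3 =
+ * 11 bytes; each field dropped by the loop below saves 3 bytes, trying 8,
+ * then 5, down to the bare 2-byte base.)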
+ * + * AccECN option can sometimes replace NOPs used for alignment of other + * TCP options (up to @max_combine_saving available). + * + * Only solutions with at least @required AccECN fields are accepted. + * + * Returns: The size of the AccECN option excluding space repurposed from + * the alignment of the other options. + */ +static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required, + int remaining) +{ + int size = TCP_ACCECN_MAXSIZE; + int max_combine_saving; + int align_size; + + if (opts->use_synack_ecn_bytes) + max_combine_saving = tcp_synack_options_combine_saving(opts); + else + max_combine_saving = opts->num_sack_blocks > 0 ? 2 : 0; + opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS; + while (opts->num_accecn_fields >= required) { + /* Pad to dword if cannot combine */ + if ((size & 0x3) > max_combine_saving) + align_size = ALIGN(size, 4); + else + align_size = ALIGN_DOWN(size, 4); + + if (remaining >= align_size) { + size = align_size; + break; + } + + opts->num_accecn_fields--; + size -= TCPOLEN_ACCECN_PERFIELD; + } + if (opts->num_accecn_fields < required) + return 0; + + opts->options |= OPTION_ACCECN; + return size; +} + /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ @@ -850,6 +981,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + /* Simultaneous open SYN/ACK needs AccECN option but not SYN */ + if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && + tcp_ecn_mode_accecn(tp) && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && + remaining >= TCPOLEN_ACCECN_BASE)) { + opts->use_synack_ecn_bytes = 1; + remaining -= tcp_options_fit_accecn(opts, 0, remaining); + } + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; @@ -867,6 +1007,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; + struct tcp_request_sock *treq = tcp_rsk(req); if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; @@ -929,6 +1070,13 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + if (treq->accecn_ok && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && + remaining >= TCPOLEN_ACCECN_BASE) { + opts->use_synack_ecn_bytes = 1; + remaining -= tcp_options_fit_accecn(opts, 0, remaining); + } + bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, synack_type, opts, &remaining); @@ -1001,6 +1149,13 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks = 0; } + if (tcp_ecn_mode_accecn(tp) && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) { + opts->use_synack_ecn_bytes = 0; + size += tcp_options_fit_accecn(opts, tp->accecn_minlen, + MAX_TCP_OPTION_SPACE - size); + } + if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { unsigned int remaining = MAX_TCP_OPTION_SPACE - size; -- cgit v1.2.3 From aa55a7dde7ec506bb23448a5005ae3f4f809d022 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 16 Sep 2025 10:24:31 +0200 Subject: tcp: accecn: AccECN option send control MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of sending the option in every ACK, limit sending to those ACKs where the option is necessary: - Handshake - "Change-triggered ACK" + the ACK following it. 
The 2nd ACK is necessary to unambiguously indicate which of the ECN byte counters is increasing. The first ACK has two counters increasing due to the ecnfield edge. - ACKs with CE to allow CEP delta validations to take advantage of the option. - Force the option to be sent at least once per 2^22 bytes. The check is done using the bit edges of the byte counters (avoids the need for extra variables). - AccECN option beacon to send the option a few times per RTT even if nothing in the ECN state requires that. The default is 3 times per RTT, and its period can be set via sysctl_tcp_ecn_option_beacon. Below are the pahole outcomes before and after this patch, in which the group size of tcp_sock_write_tx is increased from 89 to 97 due to the new u64 accecn_opt_tstamp member: [BEFORE THIS PATCH] struct tcp_sock { [...] u64 tcp_wstamp_ns; /* 2488 8 */ struct list_head tsorted_sent_queue; /* 2496 16 */ [...] __cacheline_group_end__tcp_sock_write_tx[0]; /* 2521 0 */ __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */ u8 nonagle:4; /* 2521: 0 1 */ u8 rate_app_limited:1; /* 2521: 4 1 */ /* XXX 3 bits hole, try to pack */ /* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ u8 accecn_minlen:2; /* 2523: 0 1 */ u8 est_ecnfield:2; /* 2523: 2 1 */ u8 unused3:4; /* 2523: 4 1 */ [...] __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2628 0 */ [...] /* size: 3200, cachelines: 50, members: 171 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u64 tcp_wstamp_ns; /* 2488 8 */ u64 accecn_opt_tstamp; /* 2496 8 */ struct list_head tsorted_sent_queue; /* 2504 16 */ [...] __cacheline_group_end__tcp_sock_write_tx[0]; /* 2529 0 */ __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2529 0 */ u8 nonagle:4; /* 2529: 0 1 */ u8 rate_app_limited:1; /* 2529: 4 1 */ /* XXX 3 bits hole, try to pack */ /* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2530: 0 1 */ u8 unused2:4; /* 2530: 4 1 */ u8 accecn_minlen:2; /* 2531: 0 1 */ u8 est_ecnfield:2; /* 2531: 2 1 */ u8 accecn_opt_demand:2; /* 2531: 4 1 */ u8 prev_ecnfield:2; /* 2531: 6 1 */ [...] __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2636 0 */ [...] /* size: 3200, cachelines: 50, members: 173 */ } Signed-off-by: Chia-Yu Chang Co-developed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-8-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- Documentation/networking/ip-sysctl.rst | 6 +++ .../networking/net_cachelines/tcp_sock.rst | 3 ++ include/linux/tcp.h | 4 +- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 3 ++ include/net/tcp_ecn.h | 52 ++++++++++++++++++++++ net/ipv4/sysctl_net_ipv4.c | 9 ++++ net/ipv4/tcp.c | 5 ++- net/ipv4/tcp_input.c | 4 +- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 2 + net/ipv4/tcp_output.c | 26 ++++++++--- 12 files changed, 107 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 1c206501b973..a06cb99d66dc 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -487,6 +487,12 @@ tcp_ecn_option - INTEGER Default: 2 +tcp_ecn_option_beacon - INTEGER + Control Accurate ECN (AccECN) option sending frequency per RTT; it + takes effect only when tcp_ecn_option is set to 2.
+ + Default: 3 (AccECN will be sent at least 3 times per RTT) + tcp_ecn_fallback - BOOLEAN If the kernel detects that ECN connection misbehaves, enable fall back to non-ECN. Currently, this knob implements the fallback diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index b941151f8c0a..d4dc01800945 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -109,6 +109,9 @@ u8:2 syn_ect_snt write_mostly read_w u8:2 syn_ect_rcv read_mostly read_write u8:2 accecn_minlen write_mostly read_write u8:2 est_ecnfield read_write +u8:2 accecn_opt_demand read_mostly read_write +u8:2 prev_ecnfield read_write +u64 accecn_opt_tstamp read_write u8:4 accecn_fail_mode u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 73557656cb2d..f637b659b35a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -275,6 +275,7 @@ struct tcp_sock { u32 mdev_us; /* medium deviation */ u32 rtt_seq; /* sequence number to update rttvar */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ + u64 accecn_opt_tstamp; /* Last AccECN option sent timestamp */ struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ struct sk_buff *highest_sack; /* skb just after the highest * skb with SACKed bit set @@ -296,7 +297,8 @@ struct tcp_sock { unused2:4; u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ est_ecnfield:2,/* ECN field for AccECN delivered estimates */ - unused3:4; + accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */ + prev_ecnfield:2; /* ECN bits from the previous segment */ __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index acbb7dd497e1..34eb3aecb3f2 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -149,6 +149,7 @@ struct netns_ipv4 { u8 sysctl_tcp_ecn; u8 sysctl_tcp_ecn_option; + u8 sysctl_tcp_ecn_option_beacon; u8 sysctl_tcp_ecn_fallback; u8 sysctl_ip_default_ttl; diff --git a/include/net/tcp.h b/include/net/tcp.h index 6be29129465e..78dd7b8a4145 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -100,6 +100,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* Maximal number of window scale according to RFC1323 */ #define TCP_MAX_WSCALE 14U +/* Default sending frequency of accurate ECN option per RTT */ +#define TCP_ACCECN_OPTION_BEACON 3 + /* urg_data states */ #define TCP_URG_VALID 0x0100 #define TCP_URG_NOTYET 0x0200 diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index 08c7f4757e4e..133fb6b79500 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -176,6 +176,17 @@ static inline void tcp_accecn_third_ack(struct sock *sk, } } +/* Demand the minimum # to send AccECN option */ +static inline void tcp_accecn_opt_demand_min(struct sock *sk, + u8 opt_demand_min) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 opt_demand; + + opt_demand = max_t(u8, opt_demand_min, tp->accecn_opt_demand); + tp->accecn_opt_demand = opt_demand; +} + /* Maps IP ECN field ECT/CE code point to AccECN option field number, given * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0).
 */ @@ -256,6 +267,7 @@ static inline void tcp_ecn_received_counters(struct sock *sk, u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; u8 is_ce = INET_ECN_is_ce(ecnfield); struct tcp_sock *tp = tcp_sk(sk); + bool ecn_edge; if (!INET_ECN_is_not_ect(ecnfield)) { u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs); @@ -274,9 +286,34 @@ if (len > 0) { u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); + u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1]; + u32 bytes_mask = GENMASK_U32(31, 22); + tp->received_ecn_bytes[ecnfield - 1] += len; tp->accecn_minlen = max_t(u8, tp->accecn_minlen, minlen); + + /* Send AccECN option at least once per 2^22-byte + * increase in any ECN byte counter. + */ + if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) & + bytes_mask) { + tcp_accecn_opt_demand_min(sk, 1); + } + } + } + + ecn_edge = tp->prev_ecnfield != ecnfield; + if (ecn_edge || is_ce) { + tp->prev_ecnfield = ecnfield; + /* Demand Accurate ECN change-triggered ACKs. Two ACKs are + * demanded to indicate unambiguously the ecnfield value + * in the latter ACK. + */ + if (tcp_ecn_mode_accecn(tp)) { + if (ecn_edge) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + tp->accecn_opt_demand = 2; } } } @@ -349,6 +386,7 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp) __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes); tp->accecn_minlen = 0; + tp->accecn_opt_demand = 0; tp->est_ecnfield = 0; } @@ -431,6 +469,7 @@ static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, default: tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; + tp->accecn_opt_demand = 2; if (INET_ECN_is_ce(ip_dsfield) && tcp_accecn_validate_syn_feedback(sk, ace, tp->syn_ect_snt)) { @@ -451,6 +490,7 @@ static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th, } else { tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + tp->prev_ecnfield = tp->syn_ect_rcv; tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); } } @@ -542,4 +582,16 @@ tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) th->ece = 1; } +static inline bool tcp_accecn_option_beacon_check(const struct sock *sk) +{ + u32 ecn_beacon = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon); + const struct tcp_sock *tp = tcp_sk(sk); + + if (!ecn_beacon) + return false; + + return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * ecn_beacon >= + (tp->srtt_us >> 3); +} + #endif /* _LINUX_TCP_ECN_H */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4a697acb4e85..24dbc603cc44 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -740,6 +740,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, + { + .procname = "tcp_ecn_option_beacon", + .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8c4a4b8666fc..090f9ac43d4c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3410,6 +3410,8 @@ int tcp_disconnect(struct sock *sk, int flags) tp->delivered_ce = 0; tp->accecn_fail_mode = 0; tcp_accecn_init_counters(tp); + tp->prev_ecnfield = 0; +
tp->accecn_opt_tstamp = 0; if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); @@ -5134,11 +5136,12 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 97); /* TXRX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e898a76c485e..87154fd86167 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6121,8 +6121,10 @@ step1: * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { - if (tcp_ecn_mode_accecn(tp)) + if (tcp_ecn_mode_accecn(tp)) { accecn_reflector = true; + tcp_accecn_opt_demand_min(sk, 1); + } if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt && diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index aa8dbfe20924..6a63be1f6461 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3562,6 +3562,7 @@ static int __net_init tcp_sk_init(struct net *net) { net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; + net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1dbcc09ff7a9..193343494558 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -463,6 +463,8 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->prev_ecnfield = treq->syn_ect_rcv; + tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); } else { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 34e5c83bbace..f897c2594954 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -705,8 +705,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(((e0b & 0xffffff) << 8) | TCPOPT_NOP); } - if (tp) + if (tp) { tp->accecn_minlen = 0; + tp->accecn_opt_tstamp = tp->tcp_mstamp; + if (tp->accecn_opt_demand) + tp->accecn_opt_demand--; + } } if (unlikely(OPTION_SACK_ADVERTISE & options)) { @@ -1149,11 +1153,16 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks = 0; } - if (tcp_ecn_mode_accecn(tp) && - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) { - opts->use_synack_ecn_bytes = 0; - size += tcp_options_fit_accecn(opts, tp->accecn_minlen, - MAX_TCP_OPTION_SPACE - size); + if (tcp_ecn_mode_accecn(tp)) { + int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); + + if (ecn_opt && + (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || + tcp_accecn_option_beacon_check(sk))) { + opts->use_synack_ecn_bytes = 0; + size += tcp_options_fit_accecn(opts, tp->accecn_minlen, + MAX_TCP_OPTION_SPACE - size); + } } if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, @@ -2863,6 +2872,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, sent_pkts = 0; tcp_mstamp_refresh(tp); + + /* AccECN option beacon depends on mstamp, it may change mss */ + if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk)) + mss_now = tcp_current_mss(sk); + if (!push_one) { /* Do MTU probing. */ result = tcp_mtu_probe(sk); -- cgit v1.2.3 From b40671b5ee588c8a61b2d0eacbad32ffc57e9a8f Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 16 Sep 2025 10:24:32 +0200 Subject: tcp: accecn: AccECN option failure handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AccECN option may fail in various ways, handle these: - Attempt to negotiate the use of AccECN on the 1st retransmitted SYN - From the 2nd retransmitted SYN, stop AccECN negotiation - Remove option from SYN/ACK rexmits to handle blackholes - If no option arrives in SYN/ACK, assume the option is not usable - If an option arrives later, re-enable it - If the option is zeroed, disable AccECN option processing This patch uses existing padding bits in tcp_request_sock and holes in tcp_sock without increasing the size.
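As a rough illustration of the retransmission gating above (a sketch only, with a hypothetical helper name; the real checks are the icsk_retransmits and num_timeout tests added to tcp_output.c below):

static bool accecn_opt_allowed_on_rexmit(unsigned int syn_retransmits,
					 unsigned int synack_timeouts)
{
	/* Still try AccECN on the 1st retransmitted SYN ... */
	if (syn_retransmits >= 2)
		return false;	/* ... but give up from the 2nd one on */

	/* Strip the option from SYN/ACK rexmits to get past blackholes */
	if (synack_timeouts >= 1)
		return false;

	return true;
}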
Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-9-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- include/linux/tcp.h | 4 +++- include/net/tcp_ecn.h | 51 +++++++++++++++++++++++++++++++++++++++++++++--- include/uapi/linux/tcp.h | 2 ++ net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_input.c | 35 +++++++++++++++++++++++++++++++-- net/ipv4/tcp_minisocks.c | 14 +++++++++++++ net/ipv4/tcp_output.c | 11 ++++++++--- 7 files changed, 111 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f637b659b35a..3ca5ed02de6d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -173,6 +173,7 @@ struct tcp_request_sock { u8 syn_ect_snt: 2, syn_ect_rcv: 2, accecn_fail_mode:4; + u8 saw_accecn_opt :2; #ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; @@ -407,7 +408,8 @@ struct tcp_sock { syn_fastopen_child:1; /* created TFO passive child socket */ u8 keepalive_probes; /* num of allowed keep alive probes */ - u8 accecn_fail_mode:4; /* AccECN failure handling */ + u8 accecn_fail_mode:4, /* AccECN failure handling */ + saw_accecn_opt:2; /* An AccECN option was seen */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ /* RTT measurement */ diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index 133fb6b79500..f13e5cd2b1ac 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -91,6 +91,11 @@ static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode) tp->accecn_fail_mode |= mode; } +#define TCP_ACCECN_OPT_NOT_SEEN 0x0 +#define TCP_ACCECN_OPT_EMPTY_SEEN 0x1 +#define TCP_ACCECN_OPT_COUNTER_SEEN 0x2 +#define TCP_ACCECN_OPT_FAIL_SEEN 0x3 + static inline u8 tcp_accecn_ace(const struct tcphdr *th) { return (th->ae << 2) | (th->cwr << 1) | th->ece; @@ -146,6 +151,14 @@ static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, return true; } +static inline void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp, + u8 saw_opt) +{ + tp->saw_accecn_opt = saw_opt; + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); +} + /* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */ static inline void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, u8 sent_ect) @@ -428,9 +441,35 @@ static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb, } } +static inline u8 tcp_accecn_option_init(const struct sk_buff *skb, + u8 opt_offset) +{ + u8 *ptr = skb_transport_header(skb) + opt_offset; + unsigned int optlen = ptr[1] - 2; + + if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) + return TCP_ACCECN_OPT_FAIL_SEEN; + ptr += 2; + + /* Detect option zeroing: an AccECN connection "MAY check that the + * initial value of the EE0B field or the EE1B field is non-zero" + */ + if (optlen < TCPOLEN_ACCECN_PERFIELD) + return TCP_ACCECN_OPT_EMPTY_SEEN; + if (get_unaligned_be24(ptr) == 0) + return TCP_ACCECN_OPT_FAIL_SEEN; + if (optlen < TCPOLEN_ACCECN_PERFIELD * 3) + return TCP_ACCECN_OPT_COUNTER_SEEN; + ptr += TCPOLEN_ACCECN_PERFIELD * 2; + if (get_unaligned_be24(ptr) == 0) + return TCP_ACCECN_OPT_FAIL_SEEN; + + return TCP_ACCECN_OPT_COUNTER_SEEN; +} + /* See Table 2 of the AccECN draft */ -static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, - u8 ip_dsfield) +static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, + const struct tcphdr 
*th, u8 ip_dsfield) { struct tcp_sock *tp = tcp_sk(sk); u8 ace = tcp_accecn_ace(th); @@ -469,7 +508,13 @@ static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, default: tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; - tp->accecn_opt_demand = 2; + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tp->accecn_opt_demand = 2; + } if (INET_ECN_is_ce(ip_dsfield) && tcp_accecn_validate_syn_feedback(sk, ace, tp->syn_ect_snt)) { diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 53e0e85b52be..dce3113787a7 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -323,6 +323,8 @@ struct tcp_info { __u32 tcpi_received_e1_bytes; __u32 tcpi_received_e0_bytes; __u32 tcpi_received_ce_bytes; + __u16 tcpi_accecn_fail_mode; + __u16 tcpi_accecn_opt_seen; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 090f9ac43d4c..5b5c655ded1d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3409,6 +3409,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->delivered = 0; tp->delivered_ce = 0; tp->accecn_fail_mode = 0; + tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; @@ -4287,6 +4288,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + info->tcpi_accecn_fail_mode = tp->accecn_fail_mode; + info->tcpi_accecn_opt_seen = tp->saw_accecn_opt; info->tcpi_received_ce = tp->received_ce; info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 87154fd86167..5732f2d4329c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -398,7 +398,22 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, unsigned int i; u8 *ptr; + if (tcp_accecn_opt_fail_recv(tp)) + return false; + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { + if (!tp->saw_accecn_opt) { + /* Too late to enable after this point due to + * potential counter wraps + */ + if (tp->bytes_sent >= (1 << 23) - 1) { + u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN; + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + } + return false; + } + if (estimate_ecnfield) { u8 ecnfield = estimate_ecnfield - 1; @@ -415,6 +430,13 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, order1 = (ptr[0] == TCPOPT_ACCECN1); ptr += 2; + if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + tp->saw_accecn_opt = tcp_accecn_option_init(skb, + tp->rx_opt.accecn); + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); + } + res = !!estimate_ecnfield; for (i = 0; i < 3; i++) { u32 init_offset; @@ -6123,7 +6145,13 @@ step1: if (th->syn) { if (tcp_ecn_mode_accecn(tp)) { accecn_reflector = true; - tcp_accecn_opt_demand_min(sk, 1); + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tcp_accecn_opt_demand_min(sk, 1); + } } if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && @@ -6606,7 +6634,8 @@ consume: */ if 
(tcp_ecn_mode_any(tp)) - tcp_ecn_rcv_synack(sk, th, TCP_SKB_CB(skb)->ip_dsfield); + tcp_ecn_rcv_synack(sk, skb, th, + TCP_SKB_CB(skb)->ip_dsfield); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_try_undo_spurious_syn(sk); @@ -7177,6 +7206,8 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; tcp_rsk(req)->accecn_ok = 0; + tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; + tcp_rsk(req)->accecn_fail_mode = 0; tcp_rsk(req)->syn_ect_rcv = 0; tcp_rsk(req)->syn_ect_snt = 0; req->mss = rx_opt->mss_clamp; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 193343494558..327095ef95ef 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -463,6 +463,7 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->saw_accecn_opt = treq->saw_accecn_opt; tp->prev_ecnfield = treq->syn_ect_rcv; tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); @@ -678,6 +679,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, bool own_req; tmp_opt.saw_tstamp = 0; + tmp_opt.accecn = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); @@ -855,6 +857,18 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; + if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn && + tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn); + + tcp_rsk(req)->saw_accecn_opt = saw_opt; + if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) { + u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV; + + tcp_rsk(req)->accecn_fail_mode |= fail_mode; + } + } + /* For Fast Open no more processing is needed (sk is the * child socket). */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f897c2594954..65b90f73daa0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -985,9 +985,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } - /* Simultaneous open SYN/ACK needs AccECN option but not SYN */ + /* Simultaneous open SYN/ACK needs AccECN option but not SYN. + * It is attempted to negotiate the use of AccECN also on the first + * retransmitted SYN, as mentioned in "3.1.4.1. Retransmitted SYNs" + * of AccECN draft. 
+ */ if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && tcp_ecn_mode_accecn(tp) && + inet_csk(sk)->icsk_retransmits < 2 && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && remaining >= TCPOLEN_ACCECN_BASE)) { opts->use_synack_ecn_bytes = 1; @@ -1076,7 +1081,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, if (treq->accecn_ok && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && - remaining >= TCPOLEN_ACCECN_BASE) { + req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) { opts->use_synack_ecn_bytes = 1; remaining -= tcp_options_fit_accecn(opts, 0, remaining); } @@ -1156,7 +1161,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (tcp_ecn_mode_accecn(tp)) { int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); - if (ecn_opt && + if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) && (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || tcp_accecn_option_beacon_check(sk))) { opts->use_synack_ecn_bytes = 0; -- cgit v1.2.3 From 3fbb2a6f3a70c27a6a2be80d131970608c0f84d0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:42 +0000 Subject: ipv6: make ipv6_pinfo.saddr_cache a boolean ipv6_pinfo.saddr_cache is either NULL or &np->saddr. We do not need 8 bytes, a boolean is enough. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-2-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/ipv6.h | 4 ++-- include/net/ip6_route.h | 4 ++-- net/ipv6/af_inet6.c | 2 +- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_output.c | 3 ++- net/ipv6/route.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- 7 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index f43314517396..55c4d1e4dd7d 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -216,10 +216,10 @@ struct inet6_cork { struct ipv6_pinfo { struct in6_addr saddr; struct in6_pktinfo sticky_pktinfo; - const struct in6_addr *daddr_cache; #ifdef CONFIG_IPV6_SUBTREES - const struct in6_addr *saddr_cache; + bool saddr_cache; #endif + const struct in6_addr *daddr_cache; __be32 flow_label; __u32 frag_size; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 59f48ca3abdf..223c02d42688 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -230,7 +230,7 @@ static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) */ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, const struct in6_addr *daddr, - const struct in6_addr *saddr) + bool saddr_set) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -238,7 +238,7 @@ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, sk_setup_caps(sk, dst); np->daddr_cache = daddr; #ifdef CONFIG_IPV6_SUBTREES - np->saddr_cache = saddr; + np->saddr_cache = saddr_set; #endif } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 1992621e3f3f..c342f8daea7f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -857,7 +857,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return PTR_ERR(dst); } - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, false); } return 0; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 333e43434dd7..1947ccdb00df 100644 --- a/net/ipv6/inet6_connection_sock.c +++ 
b/net/ipv6/inet6_connection_sock.c @@ -91,7 +91,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, false); } return dst; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 9d64c13bab5e..82ff6e1293d0 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1102,7 +1102,8 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, */ if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || #ifdef CONFIG_IPV6_SUBTREES - ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || + ip6_rt_check(&rt->rt6i_src, &fl6->saddr, + np->saddr_cache ? &np->saddr : NULL) || #endif (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) { dst_release(dst); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3371f16b7a3e..e1b0aebf8bf9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3036,9 +3036,9 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, &sk->sk_v6_daddr : NULL, #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_equal(&fl6->saddr, &np->saddr) ? - &np->saddr : + true : #endif - NULL); + false); } static bool ip6_redirect_nh_match(const struct fib6_result *res, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5f0a138f4220..ecc3a87cd8c4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -299,7 +299,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, NULL, false); icsk->icsk_ext_hdr_len = 0; if (opt) @@ -1459,7 +1459,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ip6_dst_store(newsk, dst, NULL, NULL); + ip6_dst_store(newsk, dst, NULL, false); newnp->saddr = ireq->ir_v6_loc_addr; -- cgit v1.2.3 From 5489f333ef993bfceebce9ae98944f04eaafcc30 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:43 +0000 Subject: ipv6: make ipv6_pinfo.daddr_cache a boolean ipv6_pinfo.daddr_cache is either NULL or &sk->sk_v6_daddr. We do not need 8 bytes, a boolean is enough.
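The conversion pattern, sketched with hypothetical demo_ types (the real change is in the ipv6.h and ip6_route.h hunks below): since the cached pointer can only ever be NULL or one well-known address, a bool carries the same information, and the pointer is rebuilt on demand at the ip6_rt_check() call sites.

struct demo_pinfo {
	bool daddr_cache;	/* was: const struct in6_addr *daddr_cache */
};

static const struct in6_addr *
demo_cached_daddr(const struct demo_pinfo *np,
		  const struct in6_addr *sk_v6_daddr)
{
	/* Reconstruct the old pointer value from the flag */
	return np->daddr_cache ? sk_v6_daddr : NULL;
}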
Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-3-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/ipv6.h | 2 +- include/net/ip6_route.h | 4 ++-- net/ipv6/af_inet6.c | 2 +- net/ipv6/inet6_connection_sock.c | 2 +- net/ipv6/ip6_output.c | 3 ++- net/ipv6/route.c | 3 +-- net/ipv6/tcp_ipv6.c | 4 ++-- 7 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 55c4d1e4dd7d..8e6d9f8b3dc8 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -219,7 +219,7 @@ struct ipv6_pinfo { #ifdef CONFIG_IPV6_SUBTREES bool saddr_cache; #endif - const struct in6_addr *daddr_cache; + bool daddr_cache; __be32 flow_label; __u32 frag_size; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 223c02d42688..7c5512baa4b2 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -229,14 +229,14 @@ static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) * Store a destination cache entry in a socket */ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, - const struct in6_addr *daddr, + bool daddr_set, bool saddr_set) { struct ipv6_pinfo *np = inet6_sk(sk); np->dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); sk_setup_caps(sk, dst); - np->daddr_cache = daddr; + np->daddr_cache = daddr_set; #ifdef CONFIG_IPV6_SUBTREES np->saddr_cache = saddr_set; #endif diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c342f8daea7f..1b0314644e0c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -857,7 +857,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return PTR_ERR(dst); } - ip6_dst_store(sk, dst, NULL, false); + ip6_dst_store(sk, dst, false, false); } return 0; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 1947ccdb00df..ea5cf3fdfdd6 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -91,7 +91,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) - ip6_dst_store(sk, dst, NULL, false); + ip6_dst_store(sk, dst, false, false); } return dst; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 82ff6e1293d0..f904739e99b9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1100,7 +1100,8 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, * sockets. * 2. oif also should be the same. */ - if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || + if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, + np->daddr_cache ? &sk->sk_v6_daddr : NULL) || #ifdef CONFIG_IPV6_SUBTREES ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache ? &np->saddr : NULL) || diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e1b0aebf8bf9..aee6a10b112a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3032,8 +3032,7 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, #endif ip6_dst_store(sk, dst, - ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? - &sk->sk_v6_daddr : NULL, + ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr), #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 
true : diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ecc3a87cd8c4..c7271f6359db 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -299,7 +299,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(sk, dst, NULL, false); + ip6_dst_store(sk, dst, false, false); icsk->icsk_ext_hdr_len = 0; if (opt) @@ -1459,7 +1459,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ip6_dst_store(newsk, dst, NULL, false); + ip6_dst_store(newsk, dst, false, false); newnp->saddr = ireq->ir_v6_loc_addr; -- cgit v1.2.3 From b76543b21fbcfbb96332fd80cc0d85bbcd72d8f0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:45 +0000 Subject: ipv6: reorganise struct ipv6_pinfo Move fields used in the tx fast path to the beginning of the structure, and seldom used ones to the end. Note that rxopt is also in the first cache line. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-5-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/ipv6.h | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 8e6d9f8b3dc8..43b7bb828738 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -214,18 +214,21 @@ struct inet6_cork { /* struct ipv6_pinfo - ipv6 private area */ struct ipv6_pinfo { + /* Used in tx path (inet6_csk_route_socket(), ip6_xmit()) */ struct in6_addr saddr; - struct in6_pktinfo sticky_pktinfo; + __be32 flow_label; + u32 dst_cookie; + struct ipv6_txoptions __rcu *opt; + s16 hop_limit; + u8 pmtudisc; + u8 tclass; #ifdef CONFIG_IPV6_SUBTREES bool saddr_cache; #endif bool daddr_cache; - __be32 flow_label; - __u32 frag_size; - - s16 hop_limit; u8 mcast_hops; + u32 frag_size; int ucast_oif; int mcast_oif; @@ -233,7 +236,7 @@ struct ipv6_pinfo { /* pktoption flags */ union { struct { - __u16 srcrt:1, + u16 srcrt:1, osrcrt:1, rxinfo:1, rxoinfo:1, @@ -250,29 +253,25 @@ struct ipv6_pinfo { recvfragsize:1; /* 1 bits hole */ } bits; - __u16 all; + u16 all; } rxopt; /* sockopt flags */ - __u8 srcprefs; /* 001: prefer temporary address + u8 srcprefs; /* 001: prefer temporary address * 010: prefer public address * 100: prefer care-of address */ - __u8 pmtudisc; - __u8 min_hopcount; - __u8 tclass; + u8 min_hopcount; __be32 rcv_flowinfo; + struct in6_pktinfo sticky_pktinfo; - __u32 dst_cookie; + struct sk_buff *pktoptions; + struct sk_buff *rxpmtu; + struct inet6_cork cork; struct ipv6_mc_socklist __rcu *ipv6_mc_list; struct ipv6_ac_socklist *ipv6_ac_list; struct ipv6_fl_socklist __rcu *ipv6_fl_list; - - struct ipv6_txoptions __rcu *opt; - struct sk_buff *pktoptions; - struct sk_buff *rxpmtu; - struct inet6_cork cork; }; /* We currently use available bits from inet_sk(sk)->inet_flags, -- cgit v1.2.3 From 3cd04c8f4afed71a48edef0db5255afc249c2feb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Sep 2025 16:09:50 +0000 Subject: udp: make busylock per socket While having all spinlocks packed into an array was a space saver, this also caused NUMA imbalance and hash collisions. UDPv6 socket size becomes 1600 after this patch.
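A condensed sketch of the change (hypothetical demo_ names; the actual hunks follow): the lock moves from a shared, pointer-hashed array into udp_sock itself, so two unrelated sockets can no longer collide on one spinlock and the lock memory sits on the socket's own NUMA node.

struct demo_udp_sock {
	/* ... other fields ... */
	spinlock_t busylock ____cacheline_aligned_in_smp;
};

static spinlock_t *demo_busylock_acquire(struct demo_udp_sock *up)
{
	spinlock_t *busy = &up->busylock;	/* was: hashed array slot */

	spin_lock(busy);
	return busy;
}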
Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250916160951.541279-10-edumazet@google.com Reviewed-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/udp.h | 1 + include/net/udp.h | 1 + net/ipv4/udp.c | 20 ++------------------ 3 files changed, 4 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 6ed008ab1665..e554890c4415 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -109,6 +109,7 @@ struct udp_sock { */ struct hlist_node tunnel_list; struct numa_drop_counters drop_counters; + spinlock_t busylock ____cacheline_aligned_in_smp; }; #define udp_test_bit(nr, sk) \ diff --git a/include/net/udp.h b/include/net/udp.h index a08822e294b0..eecd64097f91 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -289,6 +289,7 @@ static inline void udp_lib_init_sock(struct sock *sk) struct udp_sock *up = udp_sk(sk); sk->sk_drop_counters = &up->drop_counters; + spin_lock_init(&up->busylock); skb_queue_head_init(&up->reader_queue); INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 25143f932447..7d1444821ee5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1689,17 +1689,11 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) * to relieve pressure on the receive_queue spinlock shared by consumer. * Under flood, this means that only one producer can be in line * trying to acquire the receive_queue spinlock. - * These busylock can be allocated on a per cpu manner, instead of a - * per socket one (that would consume a cache line per socket) */ -static int udp_busylocks_log __read_mostly; -static spinlock_t *udp_busylocks __read_mostly; - -static spinlock_t *busylock_acquire(void *ptr) +static spinlock_t *busylock_acquire(struct sock *sk) { - spinlock_t *busy; + spinlock_t *busy = &udp_sk(sk)->busylock; - busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log); spin_lock(busy); return busy; } @@ -3997,7 +3991,6 @@ static void __init bpf_iter_register(void) void __init udp_init(void) { unsigned long limit; - unsigned int i; udp_table_init(&udp_table, "UDP"); limit = nr_free_buffer_pages() / 8; @@ -4006,15 +3999,6 @@ void __init udp_init(void) sysctl_udp_mem[1] = limit; sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; - /* 16 spinlocks per cpu */ - udp_busylocks_log = ilog2(nr_cpu_ids) + 4; - udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log, - GFP_KERNEL); - if (!udp_busylocks) - panic("UDP: failed to alloc udp_busylocks\n"); - for (i = 0; i < (1U << udp_busylocks_log); i++) - spin_lock_init(udp_busylocks + i); - if (register_pernet_subsys(&udp_sysctl_ops)) panic("UDP: failed to init sysctl parameters.\n"); -- cgit v1.2.3 From 6b88293aae7fb78872e5cc1ec36e2f750ae12e38 Mon Sep 17 00:00:00 2001 From: Markus Stockhausen Date: Wed, 10 Sep 2025 14:32:58 -0400 Subject: mtd: nand: move nand_check_erased_ecc_chunk() to nand/core The check function for bitflips in erased blocks will be needed by the Realtek ECC engine driver (which is currently under development). Right now it is located in raw/nand_base.c. While this is sufficient for the current use cases, there is no real dependency for an ECC engine on the raw nand library. Move the function over to a more generic place in the core library.
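For context, the usual call pattern in a driver read path (a sketch with hypothetical demo_ helpers, following the kernel-doc of the function being moved): when ECC correction fails on a chunk, check whether the chunk is merely erased before treating it as uncorrectable, and fold the per-chunk result into max_bitflips.

static int demo_read_page(struct demo_ctx *ctx, u8 *data, u8 *ecc)
{
	unsigned int max_bitflips = 0;
	int step, ret;

	for (step = 0; step < ctx->nsteps; step++) {
		u8 *d = data + step * ctx->chunk_size;
		u8 *e = ecc + step * ctx->ecc_bytes;

		ret = demo_correct_chunk(ctx, d, e);
		if (ret == -EBADMSG)
			/* Maybe an erased chunk with a few bitflips? */
			ret = nand_check_erased_ecc_chunk(d, ctx->chunk_size,
							  e, ctx->ecc_bytes,
							  NULL, 0,
							  ctx->strength);
		if (ret < 0)
			return ret;	/* genuinely uncorrectable */

		max_bitflips = max(max_bitflips, (unsigned int)ret);
	}

	return max_bitflips;
}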
Suggested-by: Miquel Raynal Signed-off-by: Markus Stockhausen Signed-off-by: Miquel Raynal --- drivers/mtd/nand/core.c | 131 +++++++++++++++++++++++++++++++++++++++ drivers/mtd/nand/raw/nand_base.c | 131 --------------------------------------- include/linux/mtd/nand.h | 5 ++ include/linux/mtd/rawnand.h | 5 -- 4 files changed, 136 insertions(+), 136 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/nand/core.c b/drivers/mtd/nand/core.c index 7737b1a4a177..3e76d127715f 100644 --- a/drivers/mtd/nand/core.c +++ b/drivers/mtd/nand/core.c @@ -12,6 +12,137 @@ #include #include +/** + * nand_check_erased_buf - check if a buffer contains (almost) only 0xff data + * @buf: buffer to test + * @len: buffer length + * @bitflips_threshold: maximum number of bitflips + * + * Check if a buffer contains only 0xff, which means the underlying region + * has been erased and is ready to be programmed. + * The bitflips_threshold specify the maximum number of bitflips before + * considering the region is not erased. + * Note: The logic of this function has been extracted from the memweight + * implementation, except that nand_check_erased_buf function exit before + * testing the whole buffer if the number of bitflips exceed the + * bitflips_threshold value. + * + * Returns a positive number of bitflips less than or equal to + * bitflips_threshold, or -ERROR_CODE for bitflips in excess of the + * threshold. + */ +static int nand_check_erased_buf(void *buf, int len, int bitflips_threshold) +{ + const unsigned char *bitmap = buf; + int bitflips = 0; + int weight; + + for (; len && ((uintptr_t)bitmap) % sizeof(long); + len--, bitmap++) { + weight = hweight8(*bitmap); + bitflips += BITS_PER_BYTE - weight; + if (unlikely(bitflips > bitflips_threshold)) + return -EBADMSG; + } + + for (; len >= sizeof(long); + len -= sizeof(long), bitmap += sizeof(long)) { + unsigned long d = *((unsigned long *)bitmap); + if (d == ~0UL) + continue; + weight = hweight_long(d); + bitflips += BITS_PER_LONG - weight; + if (unlikely(bitflips > bitflips_threshold)) + return -EBADMSG; + } + + for (; len > 0; len--, bitmap++) { + weight = hweight8(*bitmap); + bitflips += BITS_PER_BYTE - weight; + if (unlikely(bitflips > bitflips_threshold)) + return -EBADMSG; + } + + return bitflips; +} + +/** + * nand_check_erased_ecc_chunk - check if an ECC chunk contains (almost) only + * 0xff data + * @data: data buffer to test + * @datalen: data length + * @ecc: ECC buffer + * @ecclen: ECC length + * @extraoob: extra OOB buffer + * @extraooblen: extra OOB length + * @bitflips_threshold: maximum number of bitflips + * + * Check if a data buffer and its associated ECC and OOB data contains only + * 0xff pattern, which means the underlying region has been erased and is + * ready to be programmed. + * The bitflips_threshold specify the maximum number of bitflips before + * considering the region as not erased. + * + * Note: + * 1/ ECC algorithms are working on pre-defined block sizes which are usually + * different from the NAND page size. When fixing bitflips, ECC engines will + * report the number of errors per chunk, and the NAND core infrastructure + * expect you to return the maximum number of bitflips for the whole page. + * This is why you should always use this function on a single chunk and + * not on the whole page. After checking each chunk you should update your + * max_bitflips value accordingly. 
+ * 2/ When checking for bitflips in erased pages you should not only check + * the payload data but also their associated ECC data, because a user might + * have programmed almost all bits to 1 but a few. In this case, we + * shouldn't consider the chunk as erased, and checking ECC bytes prevent + * this case. + * 3/ The extraoob argument is optional, and should be used if some of your OOB + * data are protected by the ECC engine. + * It could also be used if you support subpages and want to attach some + * extra OOB data to an ECC chunk. + * + * Returns a positive number of bitflips less than or equal to + * bitflips_threshold, or -ERROR_CODE for bitflips in excess of the + * threshold. In case of success, the passed buffers are filled with 0xff. + */ +int nand_check_erased_ecc_chunk(void *data, int datalen, + void *ecc, int ecclen, + void *extraoob, int extraooblen, + int bitflips_threshold) +{ + int data_bitflips = 0, ecc_bitflips = 0, extraoob_bitflips = 0; + + data_bitflips = nand_check_erased_buf(data, datalen, + bitflips_threshold); + if (data_bitflips < 0) + return data_bitflips; + + bitflips_threshold -= data_bitflips; + + ecc_bitflips = nand_check_erased_buf(ecc, ecclen, bitflips_threshold); + if (ecc_bitflips < 0) + return ecc_bitflips; + + bitflips_threshold -= ecc_bitflips; + + extraoob_bitflips = nand_check_erased_buf(extraoob, extraooblen, + bitflips_threshold); + if (extraoob_bitflips < 0) + return extraoob_bitflips; + + if (data_bitflips) + memset(data, 0xff, datalen); + + if (ecc_bitflips) + memset(ecc, 0xff, ecclen); + + if (extraoob_bitflips) + memset(extraoob, 0xff, extraooblen); + + return data_bitflips + ecc_bitflips + extraoob_bitflips; +} +EXPORT_SYMBOL(nand_check_erased_ecc_chunk); + /** * nanddev_isbad() - Check if a block is bad * @nand: NAND device diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 13e4060bd1b6..c7d9501f646b 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -2783,137 +2783,6 @@ int nand_set_features(struct nand_chip *chip, int addr, return nand_set_features_op(chip, addr, subfeature_param); } -/** - * nand_check_erased_buf - check if a buffer contains (almost) only 0xff data - * @buf: buffer to test - * @len: buffer length - * @bitflips_threshold: maximum number of bitflips - * - * Check if a buffer contains only 0xff, which means the underlying region - * has been erased and is ready to be programmed. - * The bitflips_threshold specify the maximum number of bitflips before - * considering the region is not erased. - * Note: The logic of this function has been extracted from the memweight - * implementation, except that nand_check_erased_buf function exit before - * testing the whole buffer if the number of bitflips exceed the - * bitflips_threshold value. - * - * Returns a positive number of bitflips less than or equal to - * bitflips_threshold, or -ERROR_CODE for bitflips in excess of the - * threshold. 
- */ -static int nand_check_erased_buf(void *buf, int len, int bitflips_threshold) -{ - const unsigned char *bitmap = buf; - int bitflips = 0; - int weight; - - for (; len && ((uintptr_t)bitmap) % sizeof(long); - len--, bitmap++) { - weight = hweight8(*bitmap); - bitflips += BITS_PER_BYTE - weight; - if (unlikely(bitflips > bitflips_threshold)) - return -EBADMSG; - } - - for (; len >= sizeof(long); - len -= sizeof(long), bitmap += sizeof(long)) { - unsigned long d = *((unsigned long *)bitmap); - if (d == ~0UL) - continue; - weight = hweight_long(d); - bitflips += BITS_PER_LONG - weight; - if (unlikely(bitflips > bitflips_threshold)) - return -EBADMSG; - } - - for (; len > 0; len--, bitmap++) { - weight = hweight8(*bitmap); - bitflips += BITS_PER_BYTE - weight; - if (unlikely(bitflips > bitflips_threshold)) - return -EBADMSG; - } - - return bitflips; -} - -/** - * nand_check_erased_ecc_chunk - check if an ECC chunk contains (almost) only - * 0xff data - * @data: data buffer to test - * @datalen: data length - * @ecc: ECC buffer - * @ecclen: ECC length - * @extraoob: extra OOB buffer - * @extraooblen: extra OOB length - * @bitflips_threshold: maximum number of bitflips - * - * Check if a data buffer and its associated ECC and OOB data contains only - * 0xff pattern, which means the underlying region has been erased and is - * ready to be programmed. - * The bitflips_threshold specify the maximum number of bitflips before - * considering the region as not erased. - * - * Note: - * 1/ ECC algorithms are working on pre-defined block sizes which are usually - * different from the NAND page size. When fixing bitflips, ECC engines will - * report the number of errors per chunk, and the NAND core infrastructure - * expect you to return the maximum number of bitflips for the whole page. - * This is why you should always use this function on a single chunk and - * not on the whole page. After checking each chunk you should update your - * max_bitflips value accordingly. - * 2/ When checking for bitflips in erased pages you should not only check - * the payload data but also their associated ECC data, because a user might - * have programmed almost all bits to 1 but a few. In this case, we - * shouldn't consider the chunk as erased, and checking ECC bytes prevent - * this case. - * 3/ The extraoob argument is optional, and should be used if some of your OOB - * data are protected by the ECC engine. - * It could also be used if you support subpages and want to attach some - * extra OOB data to an ECC chunk. - * - * Returns a positive number of bitflips less than or equal to - * bitflips_threshold, or -ERROR_CODE for bitflips in excess of the - * threshold. In case of success, the passed buffers are filled with 0xff. 
- */ -int nand_check_erased_ecc_chunk(void *data, int datalen, - void *ecc, int ecclen, - void *extraoob, int extraooblen, - int bitflips_threshold) -{ - int data_bitflips = 0, ecc_bitflips = 0, extraoob_bitflips = 0; - - data_bitflips = nand_check_erased_buf(data, datalen, - bitflips_threshold); - if (data_bitflips < 0) - return data_bitflips; - - bitflips_threshold -= data_bitflips; - - ecc_bitflips = nand_check_erased_buf(ecc, ecclen, bitflips_threshold); - if (ecc_bitflips < 0) - return ecc_bitflips; - - bitflips_threshold -= ecc_bitflips; - - extraoob_bitflips = nand_check_erased_buf(extraoob, extraooblen, - bitflips_threshold); - if (extraoob_bitflips < 0) - return extraoob_bitflips; - - if (data_bitflips) - memset(data, 0xff, datalen); - - if (ecc_bitflips) - memset(ecc, 0xff, ecclen); - - if (extraoob_bitflips) - memset(extraoob, 0xff, extraooblen); - - return data_bitflips + ecc_bitflips + extraoob_bitflips; -} -EXPORT_SYMBOL(nand_check_erased_ecc_chunk); - /** * nand_read_page_raw_notsupp - dummy read raw page function * @chip: nand chip info structure diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 07486168d104..09c8c93e4dba 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -1136,4 +1136,9 @@ static inline bool nanddev_bbt_is_initialized(struct nand_device *nand) int nanddev_mtd_erase(struct mtd_info *mtd, struct erase_info *einfo); int nanddev_mtd_max_bad_blocks(struct mtd_info *mtd, loff_t offs, size_t len); +int nand_check_erased_ecc_chunk(void *data, int datalen, + void *ecc, int ecclen, + void *extraoob, int extraooblen, + int threshold); + #endif /* __LINUX_MTD_NAND_H */ diff --git a/include/linux/mtd/rawnand.h b/include/linux/mtd/rawnand.h index e84522e31301..d30bdc3fcfd7 100644 --- a/include/linux/mtd/rawnand.h +++ b/include/linux/mtd/rawnand.h @@ -1519,11 +1519,6 @@ int rawnand_sw_bch_correct(struct nand_chip *chip, unsigned char *buf, unsigned char *read_ecc, unsigned char *calc_ecc); void rawnand_sw_bch_cleanup(struct nand_chip *chip); -int nand_check_erased_ecc_chunk(void *data, int datalen, - void *ecc, int ecclen, - void *extraoob, int extraooblen, - int threshold); - int nand_ecc_choose_conf(struct nand_chip *chip, const struct nand_ecc_caps *caps, int oobavail); -- cgit v1.2.3 From a3d076b0567e729d5f21a95525c4d096b1f59e79 Mon Sep 17 00:00:00 2001 From: Akiva Goldberger Date: Wed, 17 Sep 2025 16:27:58 +0300 Subject: net/mlx5: Add uar access and odp page fault counters Add bar_uar_access, odp_local_triggered_page_fault, and odp_remote_triggered_page_fault counters to the query_vnic_env command. Additionally, add corresponding capabilities bits to the HCA CAP. 
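A hypothetical consumer sketch (not part of this patch): gate on the new capability bits before reading the counters out of a QUERY_VNIC_ENV reply, using the standard MLX5_CAP_GEN()/MLX5_GET() accessors; the query itself and the demo_ function name are assumptions, while the field names come from the mlx5_ifc.h hunk below.

static void demo_read_vnic_env(struct mlx5_core_dev *mdev, void *vnic_env)
{
	if (MLX5_CAP_GEN(mdev, vnic_env_cnt_bar_uar_access))
		pr_debug("bar_uar_access: %u\n",
			 MLX5_GET(vnic_diagnostic_statistics, vnic_env,
				  bar_uar_access));

	if (MLX5_CAP_GEN(mdev, vnic_env_cnt_odp_page_fault))
		pr_debug("odp page faults: local %u remote %u\n",
			 MLX5_GET(vnic_diagnostic_statistics, vnic_env,
				  odp_local_triggered_page_fault),
			 MLX5_GET(vnic_diagnostic_statistics, vnic_env,
				  odp_remote_triggered_page_fault));
}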
Signed-off-by: Akiva Goldberger Reviewed-by: Moshe Shemesh Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1758115678-643464-1-git-send-email-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 097b1b7ada63..0cf187e13def 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1958,7 +1958,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_rqt[0x5]; u8 reserved_at_390[0x3]; u8 log_max_rqt_size[0x5]; - u8 reserved_at_398[0x3]; + u8 reserved_at_398[0x1]; + u8 vnic_env_cnt_bar_uar_access[0x1]; + u8 vnic_env_cnt_odp_page_fault[0x1]; u8 log_max_tis_per_sq[0x5]; u8 ext_stride_num_range[0x1]; @@ -4019,7 +4021,13 @@ struct mlx5_ifc_vnic_diagnostic_statistics_bits { u8 handled_pkt_steering_fail[0x40]; - u8 reserved_at_360[0xc80]; + u8 bar_uar_access[0x20]; + + u8 odp_local_triggered_page_fault[0x20]; + + u8 odp_remote_triggered_page_fault[0x20]; + + u8 reserved_at_3c0[0xc20]; }; struct mlx5_ifc_traffic_counter_bits { -- cgit v1.2.3 From 00c94ca2b99e6610e483f92e531b319eeaed94aa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:29 -0700 Subject: psp: base PSP device support Add a netlink family for PSP and allow drivers to register support. The "PSP device" is its own object. This allows us to perform more flexible reference counting / lifetime control than if PSP information was part of net_device. In the future we should also be able to "delegate" PSP access to software devices, such as *vlan, veth or netkit more easily. Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-3-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/psp.yaml | 96 +++++++++++++++ include/linux/netdevice.h | 4 + include/net/psp.h | 12 ++ include/net/psp/functions.h | 14 +++ include/net/psp/types.h | 100 ++++++++++++++++ include/uapi/linux/psp.h | 42 +++++++ net/Kconfig | 1 + net/Makefile | 1 + net/psp/Kconfig | 13 ++ net/psp/Makefile | 5 + net/psp/psp-nl-gen.c | 65 ++++++++++ net/psp/psp-nl-gen.h | 30 +++++ net/psp/psp.h | 31 +++++ net/psp/psp_main.c | 139 ++++++++++++++++++++++ net/psp/psp_nl.c | 223 +++++++++++++++++++++++++++++++++++ tools/net/ynl/Makefile.deps | 1 + 16 files changed, 777 insertions(+) create mode 100644 Documentation/netlink/specs/psp.yaml create mode 100644 include/net/psp.h create mode 100644 include/net/psp/functions.h create mode 100644 include/net/psp/types.h create mode 100644 include/uapi/linux/psp.h create mode 100644 net/psp/Kconfig create mode 100644 net/psp/Makefile create mode 100644 net/psp/psp-nl-gen.c create mode 100644 net/psp/psp-nl-gen.h create mode 100644 net/psp/psp.h create mode 100644 net/psp/psp_main.c create mode 100644 net/psp/psp_nl.c (limited to 'include/linux') diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml new file mode 100644 index 000000000000..706f4baf8764 --- /dev/null +++ b/Documentation/netlink/specs/psp.yaml @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +--- +name: psp + +doc: + PSP Security Protocol Generic Netlink family. 
+ +definitions: + - + type: enum + name: version + entries: [hdr0-aes-gcm-128, hdr0-aes-gcm-256, + hdr0-aes-gmac-128, hdr0-aes-gmac-256] + +attribute-sets: + - + name: dev + attributes: + - + name: id + doc: PSP device ID. + type: u32 + checks: + min: 1 + - + name: ifindex + doc: ifindex of the main netdevice linked to the PSP device. + type: u32 + - + name: psp-versions-cap + doc: Bitmask of PSP versions supported by the device. + type: u32 + enum: version + enum-as-flags: true + - + name: psp-versions-ena + doc: Bitmask of currently enabled (accepted on Rx) PSP versions. + type: u32 + enum: version + enum-as-flags: true + +operations: + list: + - + name: dev-get + doc: Get / dump information about PSP capable devices on the system. + attribute-set: dev + do: + request: + attributes: + - id + reply: &dev-all + attributes: + - id + - ifindex + - psp-versions-cap + - psp-versions-ena + pre: psp-device-get-locked + post: psp-device-unlock + dump: + reply: *dev-all + - + name: dev-add-ntf + doc: Notification about device appearing. + notify: dev-get + mcgrp: mgmt + - + name: dev-del-ntf + doc: Notification about device disappearing. + notify: dev-get + mcgrp: mgmt + - + name: dev-set + doc: Set the configuration of a PSP device. + attribute-set: dev + do: + request: + attributes: + - id + - psp-versions-ena + reply: + attributes: [] + pre: psp-device-get-locked + post: psp-device-unlock + - + name: dev-change-ntf + doc: Notification about device configuration being changed. + notify: dev-get + mcgrp: mgmt + +mcast-groups: + list: + - + name: mgmt + +... diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f5a840c07cf1..1c54d44805fa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1906,6 +1906,7 @@ enum netdev_reg_state { * device struct * @mpls_ptr: mpls_dev struct pointer * @mctp_ptr: MCTP specific data + * @psp_dev: PSP crypto device registered for this netdev * * @dev_addr: Hw address (before bcast, * because most packets are unicast) @@ -2310,6 +2311,9 @@ struct net_device { #if IS_ENABLED(CONFIG_MCTP) struct mctp_dev __rcu *mctp_ptr; #endif +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_dev __rcu *psp_dev; +#endif /* * Cache lines mostly used on receive path (including eth_type_trans()) diff --git a/include/net/psp.h b/include/net/psp.h new file mode 100644 index 000000000000..33bb4d1dc46e --- /dev/null +++ b/include/net/psp.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_ALL_H +#define __NET_PSP_ALL_H + +#include +#include +#include + +/* Do not add any code here. Put it in the sub-headers instead. 
*/ + +#endif /* __NET_PSP_ALL_H */ diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h new file mode 100644 index 000000000000..074f9df9afc3 --- /dev/null +++ b/include/net/psp/functions.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_HELPERS_H +#define __NET_PSP_HELPERS_H + +#include + +/* Driver-facing API */ +struct psp_dev * +psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, + struct psp_dev_caps *psd_caps, void *priv_ptr); +void psp_dev_unregister(struct psp_dev *psd); + +#endif /* __NET_PSP_HELPERS_H */ diff --git a/include/net/psp/types.h b/include/net/psp/types.h new file mode 100644 index 000000000000..d242b1ecee7d --- /dev/null +++ b/include/net/psp/types.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_H +#define __NET_PSP_H + +#include +#include + +struct netlink_ext_ack; + +#define PSP_DEFAULT_UDP_PORT 1000 + +struct psphdr { + u8 nexthdr; + u8 hdrlen; + u8 crypt_offset; + u8 verfl; + __be32 spi; + __be64 iv; + __be64 vc[]; /* optional */ +}; + +#define PSP_SPI_KEY_ID GENMASK(30, 0) +#define PSP_SPI_KEY_PHASE BIT(31) + +#define PSPHDR_CRYPT_OFFSET GENMASK(5, 0) + +#define PSPHDR_VERFL_SAMPLE BIT(7) +#define PSPHDR_VERFL_DROP BIT(6) +#define PSPHDR_VERFL_VERSION GENMASK(5, 2) +#define PSPHDR_VERFL_VIRT BIT(1) +#define PSPHDR_VERFL_ONE BIT(0) + +#define PSP_HDRLEN_NOOPT ((sizeof(struct psphdr) - 8) / 8) + +/** + * struct psp_dev_config - PSP device configuration + * @versions: PSP versions enabled on the device + */ +struct psp_dev_config { + u32 versions; +}; + +/** + * struct psp_dev - PSP device struct + * @main_netdev: original netdevice of this PSP device + * @ops: driver callbacks + * @caps: device capabilities + * @drv_priv: driver priv pointer + * @lock: instance lock, protects all fields + * @refcnt: reference count for the instance + * @id: instance id + * @config: current device configuration + * + * @rcu: RCU head for freeing the structure + */ +struct psp_dev { + struct net_device *main_netdev; + + struct psp_dev_ops *ops; + struct psp_dev_caps *caps; + void *drv_priv; + + struct mutex lock; + refcount_t refcnt; + + u32 id; + + struct psp_dev_config config; + + struct rcu_head rcu; +}; + +/** + * struct psp_dev_caps - PSP device capabilities + */ +struct psp_dev_caps { + /** + * @versions: mask of supported PSP versions + * Set this field to 0 to indicate PSP is not supported at all. + */ + u32 versions; +}; + +#define PSP_MAX_KEY 32 + +/** + * struct psp_dev_ops - netdev driver facing PSP callbacks + */ +struct psp_dev_ops { + /** + * @set_config: set configuration of a PSP device + * Driver can inspect @psd->config for the previous configuration. + * Core will update @psd->config with @config on success. 
+ */ + int (*set_config)(struct psp_dev *psd, struct psp_dev_config *conf, + struct netlink_ext_ack *extack); +}; + +#endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h new file mode 100644 index 000000000000..4a404f085190 --- /dev/null +++ b/include/uapi/linux/psp.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_PSP_H +#define _UAPI_LINUX_PSP_H + +#define PSP_FAMILY_NAME "psp" +#define PSP_FAMILY_VERSION 1 + +enum psp_version { + PSP_VERSION_HDR0_AES_GCM_128, + PSP_VERSION_HDR0_AES_GCM_256, + PSP_VERSION_HDR0_AES_GMAC_128, + PSP_VERSION_HDR0_AES_GMAC_256, +}; + +enum { + PSP_A_DEV_ID = 1, + PSP_A_DEV_IFINDEX, + PSP_A_DEV_PSP_VERSIONS_CAP, + PSP_A_DEV_PSP_VERSIONS_ENA, + + __PSP_A_DEV_MAX, + PSP_A_DEV_MAX = (__PSP_A_DEV_MAX - 1) +}; + +enum { + PSP_CMD_DEV_GET = 1, + PSP_CMD_DEV_ADD_NTF, + PSP_CMD_DEV_DEL_NTF, + PSP_CMD_DEV_SET, + PSP_CMD_DEV_CHANGE_NTF, + + __PSP_CMD_MAX, + PSP_CMD_MAX = (__PSP_CMD_MAX - 1) +}; + +#define PSP_MCGRP_MGMT "mgmt" + +#endif /* _UAPI_LINUX_PSP_H */ diff --git a/net/Kconfig b/net/Kconfig index d5865cf19799..4b563aea4c23 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -82,6 +82,7 @@ config NET_CRC32C menu "Networking options" source "net/packet/Kconfig" +source "net/psp/Kconfig" source "net/unix/Kconfig" source "net/tls/Kconfig" source "net/xfrm/Kconfig" diff --git a/net/Makefile b/net/Makefile index aac960c41db6..90e3d72bf58b 100644 --- a/net/Makefile +++ b/net/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ +obj-$(CONFIG_INET_PSP) += psp/ obj-y += ipv6/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ diff --git a/net/psp/Kconfig b/net/psp/Kconfig new file mode 100644 index 000000000000..55f9dd87446b --- /dev/null +++ b/net/psp/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# PSP configuration +# +config INET_PSP + bool "PSP Security Protocol support" + depends on INET + help + Enable kernel support for the PSP protocol. + For more information see: + https://raw.githubusercontent.com/google/psp/main/doc/PSP_Arch_Spec.pdf + + If unsure, say N. 
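As a quick orientation before the implementation files, a driver would hook into the API declared in psp/functions.h roughly like this (a minimal sketch, not part of the patch; all foo_* names are hypothetical, while psp_dev_create(), struct psp_dev_ops and struct psp_dev_caps come from this series):

static int foo_psp_set_config(struct psp_dev *psd, struct psp_dev_config *conf,
                              struct netlink_ext_ack *extack)
{
        /* program conf->versions into HW; core copies it to psd->config */
        return 0;
}

static struct psp_dev_ops foo_psp_ops = {
        .set_config     = foo_psp_set_config,
};

static struct psp_dev_caps foo_psp_caps = {
        .versions       = 1 << PSP_VERSION_HDR0_AES_GCM_128,
};

static int foo_psp_register(struct foo_dev *fdev)
{
        struct psp_dev *psd;

        psd = psp_dev_create(fdev->netdev, &foo_psp_ops, &foo_psp_caps, fdev);
        if (IS_ERR(psd))
                return PTR_ERR(psd);

        fdev->psd = psd;        /* released via psp_dev_unregister() */
        return 0;
}

Note that psp_dev_create() rejects a zero capability mask and a missing set_config callback, so both must be supplied.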
diff --git a/net/psp/Makefile b/net/psp/Makefile new file mode 100644 index 000000000000..41b51d06e560 --- /dev/null +++ b/net/psp/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_INET_PSP) += psp.o + +psp-y := psp_main.o psp_nl.o psp-nl-gen.o diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c new file mode 100644 index 000000000000..859712e7c2c1 --- /dev/null +++ b/net/psp/psp-nl-gen.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "psp-nl-gen.h" + +#include + +/* PSP_CMD_DEV_GET - do */ +static const struct nla_policy psp_dev_get_nl_policy[PSP_A_DEV_ID + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* PSP_CMD_DEV_SET - do */ +static const struct nla_policy psp_dev_set_nl_policy[PSP_A_DEV_PSP_VERSIONS_ENA + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_DEV_PSP_VERSIONS_ENA] = NLA_POLICY_MASK(NLA_U32, 0xf), +}; + +/* Ops table for psp */ +static const struct genl_split_ops psp_nl_ops[] = { + { + .cmd = PSP_CMD_DEV_GET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_get_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_get_nl_policy, + .maxattr = PSP_A_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_DEV_GET, + .dumpit = psp_nl_dev_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = PSP_CMD_DEV_SET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_set_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_set_nl_policy, + .maxattr = PSP_A_DEV_PSP_VERSIONS_ENA, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group psp_nl_mcgrps[] = { + [PSP_NLGRP_MGMT] = { "mgmt", }, +}; + +struct genl_family psp_nl_family __ro_after_init = { + .name = PSP_FAMILY_NAME, + .version = PSP_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = psp_nl_ops, + .n_split_ops = ARRAY_SIZE(psp_nl_ops), + .mcgrps = psp_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(psp_nl_mcgrps), +}; diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h new file mode 100644 index 000000000000..a099686cab5d --- /dev/null +++ b/net/psp/psp-nl-gen.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_PSP_GEN_H +#define _LINUX_PSP_GEN_H + +#include +#include + +#include + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); + +int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + PSP_NLGRP_MGMT, +}; + +extern struct genl_family psp_nl_family; + +#endif /* _LINUX_PSP_GEN_H */ diff --git a/net/psp/psp.h b/net/psp/psp.h new file mode 100644 index 000000000000..94d0cc31a61f --- /dev/null +++ b/net/psp/psp.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __PSP_PSP_H +#define __PSP_PSP_H + +#include +#include +#include +#include +#include + +extern struct xarray psp_devs; +extern struct mutex psp_devs_lock; + 
+void psp_dev_destroy(struct psp_dev *psd); +int psp_dev_check_access(struct psp_dev *psd, struct net *net); + +void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); + +static inline void psp_dev_get(struct psp_dev *psd) +{ + refcount_inc(&psd->refcnt); +} + +static inline void psp_dev_put(struct psp_dev *psd) +{ + if (refcount_dec_and_test(&psd->refcnt)) + psp_dev_destroy(psd); +} + +#endif /* __PSP_PSP_H */ diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c new file mode 100644 index 000000000000..e09499b7b14a --- /dev/null +++ b/net/psp/psp_main.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include + +#include "psp.h" +#include "psp-nl-gen.h" + +DEFINE_XARRAY_ALLOC1(psp_devs); +struct mutex psp_devs_lock; + +/** + * DOC: PSP locking + * + * psp_devs_lock protects the psp_devs xarray. + * Ordering is take the psp_devs_lock and then the instance lock. + * Each instance is protected by RCU, and has a refcount. + * When driver unregisters the instance gets flushed, but struct sticks around. + */ + +/** + * psp_dev_check_access() - check if user in a given net ns can access PSP dev + * @psd: PSP device structure user is trying to access + * @net: net namespace user is in + * + * Return: 0 if PSP device should be visible in @net, errno otherwise. + */ +int psp_dev_check_access(struct psp_dev *psd, struct net *net) +{ + if (dev_net(psd->main_netdev) == net) + return 0; + return -ENOENT; +} + +/** + * psp_dev_create() - create and register PSP device + * @netdev: main netdevice + * @psd_ops: driver callbacks + * @psd_caps: device capabilities + * @priv_ptr: back-pointer to driver private data + * + * Return: pointer to allocated PSP device, or ERR_PTR. + */ +struct psp_dev * +psp_dev_create(struct net_device *netdev, + struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, + void *priv_ptr) +{ + struct psp_dev *psd; + static u32 last_id; + int err; + + if (WARN_ON(!psd_caps->versions || + !psd_ops->set_config)) + return ERR_PTR(-EINVAL); + + psd = kzalloc(sizeof(*psd), GFP_KERNEL); + if (!psd) + return ERR_PTR(-ENOMEM); + + psd->main_netdev = netdev; + psd->ops = psd_ops; + psd->caps = psd_caps; + psd->drv_priv = priv_ptr; + + mutex_init(&psd->lock); + refcount_set(&psd->refcnt, 1); + + mutex_lock(&psp_devs_lock); + err = xa_alloc_cyclic(&psp_devs, &psd->id, psd, xa_limit_16b, + &last_id, GFP_KERNEL); + if (err) { + mutex_unlock(&psp_devs_lock); + kfree(psd); + return ERR_PTR(err); + } + mutex_lock(&psd->lock); + mutex_unlock(&psp_devs_lock); + + psp_nl_notify_dev(psd, PSP_CMD_DEV_ADD_NTF); + + rcu_assign_pointer(netdev->psp_dev, psd); + + mutex_unlock(&psd->lock); + + return psd; +} +EXPORT_SYMBOL(psp_dev_create); + +void psp_dev_destroy(struct psp_dev *psd) +{ + mutex_lock(&psp_devs_lock); + xa_erase(&psp_devs, psd->id); + mutex_unlock(&psp_devs_lock); + + mutex_destroy(&psd->lock); + kfree_rcu(psd, rcu); +} + +/** + * psp_dev_unregister() - unregister PSP device + * @psd: PSP device structure + */ +void psp_dev_unregister(struct psp_dev *psd) +{ + mutex_lock(&psp_devs_lock); + mutex_lock(&psd->lock); + + psp_nl_notify_dev(psd, PSP_CMD_DEV_DEL_NTF); + + /* Wait until psp_dev_destroy() to call xa_erase() to prevent a + * different psd from being added to the xarray with this id, while + * there are still references to this psd being held. 
+ */ + xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); + mutex_unlock(&psp_devs_lock); + + rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); + + psd->ops = NULL; + psd->drv_priv = NULL; + + mutex_unlock(&psd->lock); + + psp_dev_put(psd); +} +EXPORT_SYMBOL(psp_dev_unregister); + +static int __init psp_init(void) +{ + mutex_init(&psp_devs_lock); + + return genl_register_family(&psp_nl_family); +} + +subsys_initcall(psp_init); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c new file mode 100644 index 000000000000..fda5ce800f82 --- /dev/null +++ b/net/psp/psp_nl.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include + +#include "psp-nl-gen.h" +#include "psp.h" + +/* Netlink helpers */ + +static struct sk_buff *psp_nl_reply_new(struct genl_info *info) +{ + struct sk_buff *rsp; + void *hdr; + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return NULL; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + nlmsg_free(rsp); + return NULL; + } + + return rsp; +} + +static int psp_nl_reply_send(struct sk_buff *rsp, struct genl_info *info) +{ + /* Note that this *only* works with a single message per skb! */ + nlmsg_end(rsp, (struct nlmsghdr *)rsp->data); + + return genlmsg_reply(rsp, info); +} + +/* Device stuff */ + +static struct psp_dev * +psp_device_get_and_lock(struct net *net, struct nlattr *dev_id) +{ + struct psp_dev *psd; + int err; + + mutex_lock(&psp_devs_lock); + psd = xa_load(&psp_devs, nla_get_u32(dev_id)); + if (!psd) { + mutex_unlock(&psp_devs_lock); + return ERR_PTR(-ENODEV); + } + + mutex_lock(&psd->lock); + mutex_unlock(&psp_devs_lock); + + err = psp_dev_check_access(psd, net); + if (err) { + mutex_unlock(&psd->lock); + return ERR_PTR(err); + } + + return psd; +} + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + if (GENL_REQ_ATTR_CHECK(info, PSP_A_DEV_ID)) + return -EINVAL; + + info->user_ptr[0] = psp_device_get_and_lock(genl_info_net(info), + info->attrs[PSP_A_DEV_ID]); + return PTR_ERR_OR_ZERO(info->user_ptr[0]); +} + +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + + mutex_unlock(&psd->lock); +} + +static int +psp_nl_dev_fill(struct psp_dev *psd, struct sk_buff *rsp, + const struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || + nla_put_u32(rsp, PSP_A_DEV_IFINDEX, psd->main_netdev->ifindex) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_CAP, psd->caps->versions) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_ENA, psd->config.versions)) + goto err_cancel_msg; + + genlmsg_end(rsp, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd) +{ + struct genl_info info; + struct sk_buff *ntf; + + if (!genl_has_listeners(&psp_nl_family, dev_net(psd->main_netdev), + PSP_NLGRP_MGMT)) + return; + + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ntf) + return; + + genl_info_init_ntf(&info, &psp_nl_family, cmd); + if (psp_nl_dev_fill(psd, ntf, &info)) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, + 0, PSP_NLGRP_MGMT, GFP_KERNEL); +} + +int psp_nl_dev_get_doit(struct sk_buff *req, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct sk_buff *rsp; + int err; + 
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + err = psp_nl_dev_fill(psd, rsp, info); + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +psp_nl_dev_get_dumpit_one(struct sk_buff *rsp, struct netlink_callback *cb, + struct psp_dev *psd) +{ + if (psp_dev_check_access(psd, sock_net(rsp->sk))) + return 0; + + return psp_nl_dev_fill(psd, rsp, genl_info_dump(cb)); +} + +int psp_nl_dev_get_dumpit(struct sk_buff *rsp, struct netlink_callback *cb) +{ + struct psp_dev *psd; + int err = 0; + + mutex_lock(&psp_devs_lock); + xa_for_each_start(&psp_devs, cb->args[0], psd, cb->args[0]) { + mutex_lock(&psd->lock); + err = psp_nl_dev_get_dumpit_one(rsp, cb, psd); + mutex_unlock(&psd->lock); + if (err) + break; + } + mutex_unlock(&psp_devs_lock); + + return err; +} + +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct psp_dev_config new_config; + struct sk_buff *rsp; + int err; + + memcpy(&new_config, &psd->config, sizeof(new_config)); + + if (info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]) { + new_config.versions = + nla_get_u32(info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]); + if (new_config.versions & ~psd->caps->versions) { + NL_SET_ERR_MSG(info->extack, "Requested PSP versions not supported by the device"); + return -EINVAL; + } + } else { + NL_SET_ERR_MSG(info->extack, "No settings present"); + return -EINVAL; + } + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + if (memcmp(&new_config, &psd->config, sizeof(new_config))) { + err = psd->ops->set_config(psd, &new_config, info->extack); + if (err) + goto err_free_rsp; + + memcpy(&psd->config, &new_config, sizeof(new_config)); + } + + psp_nl_notify_dev(psd, PSP_CMD_DEV_CHANGE_NTF); + + return psp_nl_reply_send(rsp, info); + +err_free_rsp: + nlmsg_free(rsp); + return err; +} diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index 90686e241157..865fd2e8519e 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -31,6 +31,7 @@ CFLAGS_ovpn:=$(call get_hdr_inc,_LINUX_OVPN_H,ovpn.h) CFLAGS_ovs_datapath:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_flow:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_vport:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) +CFLAGS_psp:=$(call get_hdr_inc,_LINUX_PSP_H,psp.h) CFLAGS_rt-addr:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ $(call get_hdr_inc,__LINUX_IF_ADDR_H,if_addr.h) CFLAGS_rt-link:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ -- cgit v1.2.3 From ed8a507b748336902525aa79e3573552534e8b3e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:30 -0700 Subject: net: modify core data structures for PSP datapath support Add pointers to psp data structures to core networking structs, and an SKB extension to carry the PSP information from the drivers to the socket layer. 
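To make the hand-off concrete, a driver Rx completion path might record the decryption metadata roughly as below. This is a sketch: drv_rx_record_psp() and its parameters are hypothetical, while skb_ext_add(), SKB_EXT_PSP and struct psp_skb_ext are what this patch adds:

static void drv_rx_record_psp(struct sk_buff *skb, __be32 spi,
                              u16 dev_id, u8 generation, u8 version)
{
        struct psp_skb_ext *pse;

        pse = skb_ext_add(skb, SKB_EXT_PSP);
        if (!pse)
                return;         /* allocation failed: skb stays cleartext */

        pse->spi = spi;
        pse->dev_id = dev_id;
        pse->generation = generation;
        pse->version = version;
}

The socket layer can later consult the extension to learn how the packet was protected.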
Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-4-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 3 +++ include/net/inet_timewait_sock.h | 3 +++ include/net/psp/functions.h | 6 ++++++ include/net/psp/types.h | 7 +++++++ include/net/sock.h | 4 ++++ net/core/skbuff.c | 4 ++++ net/ipv4/af_inet.c | 2 ++ net/ipv4/tcp_minisocks.c | 2 ++ 8 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 62e7addccdf6..78ecfa7d00d0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4901,6 +4901,9 @@ enum skb_ext_id { #endif #if IS_ENABLED(CONFIG_MCTP_FLOWS) SKB_EXT_MCTP, +#endif +#if IS_ENABLED(CONFIG_INET_PSP) + SKB_EXT_PSP, #endif SKB_EXT_NUM, /* must be last */ }; diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 67a313575780..c1295246216c 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -81,6 +81,9 @@ struct inet_timewait_sock { struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; struct inet_bind2_bucket *tw_tb2; +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_assoc __rcu *psp_assoc; +#endif }; #define tw_tclass tw_tos diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 074f9df9afc3..d0043bd14299 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -5,10 +5,16 @@ #include +struct inet_timewait_sock; + /* Driver-facing API */ struct psp_dev * psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, void *priv_ptr); void psp_dev_unregister(struct psp_dev *psd); +/* Kernel-facing API */ +static inline void psp_sk_assoc_free(struct sock *sk) { } +static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } + #endif /* __NET_PSP_HELPERS_H */ diff --git a/include/net/psp/types.h b/include/net/psp/types.h index d242b1ecee7d..4922fc8d42fd 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -84,6 +84,13 @@ struct psp_dev_caps { #define PSP_MAX_KEY 32 +struct psp_skb_ext { + __be32 spi; + u16 dev_id; + u8 generation; + u8 version; +}; + /** * struct psp_dev_ops - netdev driver facing PSP callbacks */ diff --git a/include/net/sock.h b/include/net/sock.h index 0fd465935334..d1d3d36e39ae 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -249,6 +249,7 @@ struct sk_filter; * @sk_dst_cache: destination cache * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy + * @psp_assoc: PSP association, if socket is PSP-secured * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_tsq_flags: TCP Small Queues flags @@ -450,6 +451,9 @@ struct sock { #endif #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; +#endif +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_assoc __rcu *psp_assoc; #endif struct numa_drop_counters *sk_drop_counters; __cacheline_group_end(sock_read_rxtx); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 23b776cd9879..d331e607edfb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include @@ -5062,6 +5063,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_MCTP_FLOWS) [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), #endif +#if IS_ENABLED(CONFIG_INET_PSP) + 
[SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 76e38092cd8a..e298dacb4a06 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -102,6 +102,7 @@ #include #include #include +#include #include #include #include @@ -158,6 +159,7 @@ void inet_sock_destruct(struct sock *sk) kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); + psp_sk_assoc_free(sk); } EXPORT_SYMBOL(inet_sock_destruct); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 327095ef95ef..ddb67015ba28 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -24,6 +24,7 @@ #include #include #include +#include static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { @@ -392,6 +393,7 @@ void tcp_twsk_destructor(struct sock *sk) } #endif tcp_ao_destroy_sock(sk, true); + psp_twsk_assoc_free(inet_twsk(sk)); } void tcp_twsk_purge(struct list_head *net_exit_list) -- cgit v1.2.3 From 6684b91d04b408843bd65e2120f6db2159e4f40b Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 16 Sep 2025 21:08:38 -0700 Subject: bnxt_en: Implement ethtool .get_tunable() for ETHTOOL_PFC_PREVENTION_TOUT Return the current PFC watchdog timeout value if it is supported. Reviewed-by: Andy Gospodarek Reviewed-by: Somnath Kotur Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250917040839.1924698-10-michael.chan@broadcom.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 18 ++++++++++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 21 ++++++++++++ include/linux/bnxt/hsi.h | 40 +++++++++++++++++++++++ 4 files changed, 81 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c49a4755a94d..d59612d1e176 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -14748,6 +14748,23 @@ static bool bnxt_fw_pre_resv_vnics(struct bnxt *bp) return false; } +static void bnxt_hwrm_pfcwd_qcaps(struct bnxt *bp) +{ + struct hwrm_queue_pfcwd_timeout_qcaps_output *resp; + struct hwrm_queue_pfcwd_timeout_qcaps_input *req; + int rc; + + bp->max_pfcwd_tmo_ms = 0; + rc = hwrm_req_init(bp, req, HWRM_QUEUE_PFCWD_TIMEOUT_QCAPS); + if (rc) + return; + resp = hwrm_req_hold(bp, req); + rc = hwrm_req_send_silent(bp, req); + if (!rc) + bp->max_pfcwd_tmo_ms = le16_to_cpu(resp->max_pfcwd_timeout); + hwrm_req_drop(bp, req); +} + static int bnxt_fw_init_one_p1(struct bnxt *bp) { int rc; @@ -14825,6 +14842,7 @@ static int bnxt_fw_init_one_p2(struct bnxt *bp) if (bnxt_fw_pre_resv_vnics(bp)) bp->fw_cap |= BNXT_FW_CAP_PRE_RESV_VNICS; + bnxt_hwrm_pfcwd_qcaps(bp); bnxt_hwrm_func_qcfg(bp); bnxt_hwrm_vnic_qcaps(bp); bnxt_hwrm_port_led_qcaps(bp); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index c0f46eaf91c0..06a4c2afdf8a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2426,6 +2426,8 @@ struct bnxt { u8 max_q; u8 num_tc; + u16 max_pfcwd_tmo_ms; + u8 tph_mode; unsigned int current_interval; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 2830a2b17a27..d94a8a2bf0f9 100644 --- 
a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -4399,6 +4399,23 @@ static int bnxt_get_eee(struct net_device *dev, struct ethtool_keee *edata) return 0; } +static int bnxt_hwrm_pfcwd_qcfg(struct bnxt *bp, u16 *val) +{ + struct hwrm_queue_pfcwd_timeout_qcfg_output *resp; + struct hwrm_queue_pfcwd_timeout_qcfg_input *req; + int rc; + + rc = hwrm_req_init(bp, req, HWRM_QUEUE_PFCWD_TIMEOUT_QCFG); + if (rc) + return rc; + resp = hwrm_req_hold(bp, req); + rc = hwrm_req_send(bp, req); + if (!rc) + *val = le16_to_cpu(resp->pfcwd_timeout_value); + hwrm_req_drop(bp, req); + return rc; +} + static int bnxt_set_tunable(struct net_device *dev, const struct ethtool_tunable *tuna, const void *data) @@ -4431,6 +4448,10 @@ static int bnxt_get_tunable(struct net_device *dev, case ETHTOOL_RX_COPYBREAK: *(u32 *)data = bp->rx_copybreak; break; + case ETHTOOL_PFC_PREVENTION_TOUT: + if (!bp->max_pfcwd_tmo_ms) + return -EOPNOTSUPP; + return bnxt_hwrm_pfcwd_qcfg(bp, data); default: return -EOPNOTSUPP; } diff --git a/include/linux/bnxt/hsi.h b/include/linux/bnxt/hsi.h index 8c5dac3b3ef3..23e7b1290a92 100644 --- a/include/linux/bnxt/hsi.h +++ b/include/linux/bnxt/hsi.h @@ -6751,6 +6751,46 @@ struct hwrm_queue_dscp2pri_cfg_output { u8 valid; }; +/* hwrm_queue_pfcwd_timeout_qcaps_input (size:128b/16B) */ +struct hwrm_queue_pfcwd_timeout_qcaps_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; +}; + +/* hwrm_queue_pfcwd_timeout_qcaps_output (size:128b/16B) */ +struct hwrm_queue_pfcwd_timeout_qcaps_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 max_pfcwd_timeout; + u8 unused_0[5]; + u8 valid; +}; + +/* hwrm_queue_pfcwd_timeout_qcfg_input (size:128b/16B) */ +struct hwrm_queue_pfcwd_timeout_qcfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; +}; + +/* hwrm_queue_pfcwd_timeout_qcfg_output (size:128b/16B) */ +struct hwrm_queue_pfcwd_timeout_qcfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + __le16 pfcwd_timeout_value; + u8 unused_0[5]; + u8 valid; +}; + /* hwrm_vnic_alloc_input (size:192b/24B) */ struct hwrm_vnic_alloc_input { __le16 req_type; -- cgit v1.2.3 From fa18932afb29b51530508f28adcc824a8c00b712 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 16 Sep 2025 21:08:39 -0700 Subject: bnxt_en: Implement ethtool .set_tunable() for ETHTOOL_PFC_PREVENTION_TOUT Support the setting of the tunable if it is supported by firmware. The supported range is 0 to the maximum msec value reported by firmware. PFC_STORM_PREVENTION_AUTO is also supported and 0 means it is disabled. 
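Schematically, the value validation distilled from bnxt_set_tunable() below (bnxt_pfcwd_tmo_valid() is an illustrative helper, not part of the patch):

/*
 * val == 0                         - PFC watchdog disabled
 * 1..max_pfcwd_tmo_ms              - timeout in milliseconds
 * val == PFC_STORM_PREVENTION_AUTO - firmware-selected timeout
 */
static bool bnxt_pfcwd_tmo_valid(u16 val, u16 max_pfcwd_tmo_ms)
{
        return val <= max_pfcwd_tmo_ms || val == PFC_STORM_PREVENTION_AUTO;
}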
Reviewed-by: Andy Gospodarek Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250917040839.1924698-11-michael.chan@broadcom.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 24 ++++++++++++++++++++++- include/linux/bnxt/hsi.h | 21 ++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index d94a8a2bf0f9..be32ef8f5c96 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -4416,12 +4416,25 @@ static int bnxt_hwrm_pfcwd_qcfg(struct bnxt *bp, u16 *val) return rc; } +static int bnxt_hwrm_pfcwd_cfg(struct bnxt *bp, u16 val) +{ + struct hwrm_queue_pfcwd_timeout_cfg_input *req; + int rc; + + rc = hwrm_req_init(bp, req, HWRM_QUEUE_PFCWD_TIMEOUT_CFG); + if (rc) + return rc; + req->pfcwd_timeout_value = cpu_to_le16(val); + rc = hwrm_req_send(bp, req); + return rc; +} + static int bnxt_set_tunable(struct net_device *dev, const struct ethtool_tunable *tuna, const void *data) { struct bnxt *bp = netdev_priv(dev); - u32 rx_copybreak; + u32 rx_copybreak, val; switch (tuna->id) { case ETHTOOL_RX_COPYBREAK: @@ -4434,6 +4447,15 @@ static int bnxt_set_tunable(struct net_device *dev, bp->rx_copybreak = rx_copybreak; } return 0; + case ETHTOOL_PFC_PREVENTION_TOUT: + if (BNXT_VF(bp) || !bp->max_pfcwd_tmo_ms) + return -EOPNOTSUPP; + + val = *(u16 *)data; + if (val > bp->max_pfcwd_tmo_ms && + val != PFC_STORM_PREVENTION_AUTO) + return -EINVAL; + return bnxt_hwrm_pfcwd_cfg(bp, val); default: return -EOPNOTSUPP; } diff --git a/include/linux/bnxt/hsi.h b/include/linux/bnxt/hsi.h index 23e7b1290a92..47c34990cf23 100644 --- a/include/linux/bnxt/hsi.h +++ b/include/linux/bnxt/hsi.h @@ -6771,6 +6771,27 @@ struct hwrm_queue_pfcwd_timeout_qcaps_output { u8 valid; }; +/* hwrm_queue_pfcwd_timeout_cfg_input (size:192b/24B) */ +struct hwrm_queue_pfcwd_timeout_cfg_input { + __le16 req_type; + __le16 cmpl_ring; + __le16 seq_id; + __le16 target_id; + __le64 resp_addr; + __le16 pfcwd_timeout_value; + u8 unused_0[6]; +}; + +/* hwrm_queue_pfcwd_timeout_cfg_output (size:128b/16B) */ +struct hwrm_queue_pfcwd_timeout_cfg_output { + __le16 error_code; + __le16 req_type; + __le16 seq_id; + __le16 resp_len; + u8 unused_0[7]; + u8 valid; +}; + /* hwrm_queue_pfcwd_timeout_qcfg_input (size:128b/16B) */ struct hwrm_queue_pfcwd_timeout_qcfg_input { __le16 req_type; -- cgit v1.2.3 From f72e2cff13aefe305fc8fc6afe4f43626e4ad88c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 16 Sep 2025 15:48:01 +0200 Subject: compiler_types: Add __assume macro Make the "assume" statement attribute available via a new __assume macro. The assume attribute is used to indicate that a certain condition is assumed to be true. Compilers may or may not use this indication to generate optimized code. If this condition is violated at runtime, the behavior is undefined. Note that the clang documentation states that optimizers may react differently to this attribute, and this may even have a negative performance impact. Therefore this attribute should be used with care.
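An illustrative use, not taken from this patch, is asserting an invariant the compiler cannot prove on its own, such as pointer alignment:

static inline u64 read_aligned_ctr(const u64 *p)
{
        /* undefined behavior if p is ever misaligned at runtime */
        __assume(((unsigned long)p & 7) == 0);
        return *p;
}

With the assumption in place the compiler may, for instance, drop alignment fixup code; without CONFIG_CC_HAS_ASSUME the macro expands to nothing and the code is unchanged.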
Signed-off-by: Heiko Carstens Reviewed-by: Nathan Chancellor Signed-off-by: Alexander Gordeev --- include/linux/compiler_types.h | 23 +++++++++++++++++++++++ init/Kconfig | 10 ++++++++++ 2 files changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 16755431fc11..2f3e80bf9f35 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -329,6 +329,29 @@ struct ftrace_likely_data { #define __no_sanitize_or_inline __always_inline #endif +/* + * The assume attribute is used to indicate that a certain condition is + * assumed to be true. If this condition is violated at runtime, the behavior + * is undefined. Compilers may or may not use this indication to generate + * optimized code. + * + * Note that the clang documentation states that optimizers may react + * differently to this attribute, and this may even have a negative + * performance impact. Therefore this attribute should be used with care. + * + * Optional: only supported since gcc >= 13 + * Optional: only supported since clang >= 19 + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Statement-Attributes.html#index-assume-statement-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#id13 + * + */ +#ifdef CONFIG_CC_HAS_ASSUME +# define __assume(expr) __attribute__((__assume__(expr))) +#else +# define __assume(expr) +#endif + /* * Optional: only supported since gcc >= 15 * Optional: only supported since clang >= 18 diff --git a/init/Kconfig b/init/Kconfig index 836320251219..dabec90f18f6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -112,6 +112,16 @@ config TOOLS_SUPPORT_RELR config CC_HAS_ASM_INLINE def_bool $(success,echo 'void foo(void) { asm inline (""); }' | $(CC) -x c - -c -o /dev/null) +config CC_HAS_ASSUME + bool + # clang needs to be at least 19.1.0 since the meaning of the assume + # attribute changed: + # https://github.com/llvm/llvm-project/commit/c44fa3e8a9a44c2e9a575768a3c185354b9f6c17 + default y if CC_IS_CLANG && CLANG_VERSION >= 190100 + # supported since gcc 13.1.0 + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106654 + default y if CC_IS_GCC && GCC_VERSION >= 130100 + config CC_HAS_NO_PROFILE_FN_ATTR def_bool $(success,echo '__attribute__((no_profile_instrument_function)) int x();' | $(CC) -x c - -c -o /dev/null -Werror) -- cgit v1.2.3 From 84eaf4359c36b0ba888f571a964138d22ba5914f Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 17 Sep 2025 02:58:11 -0700 Subject: net: ethtool: add get_rx_ring_count callback to optimize RX ring queries Add a new optional get_rx_ring_count callback in ethtool_ops to allow drivers to provide the number of RX rings directly without going through the full get_rxnfc flow classification interface. Create ethtool_get_rx_ring_count() to use .get_rx_ring_count if available, falling back to get_rxnfc() otherwise. It needs to be non-static, given it will be called by other ethtool functions later, such as those calling get_rxfh().
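A driver-side hookup can then be as small as the following sketch (the foo_* names are hypothetical):

static u32 foo_get_rx_ring_count(struct net_device *dev)
{
        struct foo_priv *priv = netdev_priv(dev);

        return priv->num_rx_rings;
}

static const struct ethtool_ops foo_ethtool_ops = {
        .get_rx_ring_count      = foo_get_rx_ring_count,
        /* no need to implement get_rxnfc() just to report ring count */
};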
Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20250917-gxrings-v4-4-dae520e2e1cb@debian.org Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 2 ++ net/ethtool/common.c | 20 ++++++++++++++++++++ net/ethtool/common.h | 2 ++ net/ethtool/ioctl.c | 8 +++----- 4 files changed, 27 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index d7d757e72554..c869b7f8bce8 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -968,6 +968,7 @@ struct kernel_ethtool_ts_info { * @reset: Reset (part of) the device, as specified by a bitmask of * flags from &enum ethtool_reset_flags. Returns a negative * error code or zero. + * @get_rx_ring_count: Return the number of RX rings * @get_rxfh_key_size: Get the size of the RX flow hash key. * Returns zero if not supported for this specific device. * @get_rxfh_indir_size: Get the size of the RX flow hash indirection table. @@ -1162,6 +1163,7 @@ struct ethtool_ops { int (*set_rxnfc)(struct net_device *, struct ethtool_rxnfc *); int (*flash_device)(struct net_device *, struct ethtool_flash *); int (*reset)(struct net_device *, u32 *); + u32 (*get_rx_ring_count)(struct net_device *dev); u32 (*get_rxfh_key_size)(struct net_device *); u32 (*get_rxfh_indir_size)(struct net_device *); int (*get_rxfh)(struct net_device *, struct ethtool_rxfh_param *); diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 4f58648a27ad..10460ea3717c 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -577,6 +577,26 @@ int __ethtool_get_link(struct net_device *dev) return netif_running(dev) && dev->ethtool_ops->get_link(dev); } +int ethtool_get_rx_ring_count(struct net_device *dev) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxnfc rx_rings = {}; + int ret; + + if (ops->get_rx_ring_count) + return ops->get_rx_ring_count(dev); + + if (!ops->get_rxnfc) + return -EOPNOTSUPP; + + rx_rings.cmd = ETHTOOL_GRXRINGS; + ret = ops->get_rxnfc(dev, &rx_rings, NULL); + if (ret < 0) + return ret; + + return rx_rings.data; +} + static int ethtool_get_rxnfc_rule_count(struct net_device *dev) { const struct ethtool_ops *ops = dev->ethtool_ops; diff --git a/net/ethtool/common.h b/net/ethtool/common.h index c4d084dde5bf..1609cf4e53eb 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -54,6 +54,8 @@ void ethtool_ringparam_get_cfg(struct net_device *dev, struct kernel_ethtool_ringparam *kparam, struct netlink_ext_ack *extack); +int ethtool_get_rx_ring_count(struct net_device *dev); + int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info); int ethtool_get_ts_info_by_phc(struct net_device *dev, struct kernel_ethtool_ts_info *info, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index a0f3de76cea0..8493ee200601 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1212,23 +1212,21 @@ static noinline_for_stack int ethtool_get_rxrings(struct net_device *dev, u32 cmd, void __user *useraddr) { - const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxnfc info; size_t info_size; int ret; - if (!ops->get_rxnfc) - return -EOPNOTSUPP; - info_size = sizeof(info); ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); if (ret) return ret; - ret = ops->get_rxnfc(dev, &info, NULL); + ret = ethtool_get_rx_ring_count(dev); if (ret < 0) return ret; + info.data = ret; + return ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL); } -- cgit v1.2.3 From 
8d5b7009aabc27e626e4167fedf1e1c1c3d6b143 Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Fri, 5 Sep 2025 21:19:45 +0530 Subject: mei: bus: add mei_cldev_mtu interface Add a new helper function that allows MEI client drivers to query the maximum transmission unit (MTU) for a connected MEI client. This is useful for clients that need to transmit large payloads, such as firmware blobs, allowing them to determine the maximum message size that can be safely sent before starting transmission and size of the buffer to allocate when receiving data. Reviewed-by: Mika Westerberg Signed-off-by: Alexander Usyskin Signed-off-by: Badal Nilawar Reviewed-by: Umesh Nerlige Ramappa Signed-off-by: Rodrigo Vivi Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250905154953.3974335-2-badal.nilawar@intel.com Signed-off-by: Lucas De Marchi --- drivers/misc/mei/bus.c | 13 +++++++++++++ include/linux/mei_cl_bus.h | 1 + 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c index 5cc3ad07d5be..09aae8f9d225 100644 --- a/drivers/misc/mei/bus.c +++ b/drivers/misc/mei/bus.c @@ -614,6 +614,19 @@ u8 mei_cldev_ver(const struct mei_cl_device *cldev) } EXPORT_SYMBOL_GPL(mei_cldev_ver); +/** + * mei_cldev_mtu - max message that client can send and receive + * + * @cldev: mei client device + * + * Return: mtu or 0 if client is not connected + */ +size_t mei_cldev_mtu(const struct mei_cl_device *cldev) +{ + return mei_cl_mtu(cldev->cl); +} +EXPORT_SYMBOL_GPL(mei_cldev_mtu); + /** * mei_cldev_enabled - check whether the device is enabled * diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h index 725fd7727422..a82755e1fc40 100644 --- a/include/linux/mei_cl_bus.h +++ b/include/linux/mei_cl_bus.h @@ -113,6 +113,7 @@ int mei_cldev_register_notif_cb(struct mei_cl_device *cldev, mei_cldev_cb_t notif_cb); u8 mei_cldev_ver(const struct mei_cl_device *cldev); +size_t mei_cldev_mtu(const struct mei_cl_device *cldev); void *mei_cldev_get_drvdata(const struct mei_cl_device *cldev); void mei_cldev_set_drvdata(struct mei_cl_device *cldev, void *data); -- cgit v1.2.3 From df8922afc37aa2111ca79a216653a629146763ad Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 18 Sep 2025 13:59:15 -0600 Subject: io_uring/msg_ring: kill alloc_cache for io_kiocb allocations A recent commit: fc582cd26e88 ("io_uring/msg_ring: ensure io_kiocb freeing is deferred for RCU") fixed an issue with not deferring freeing of io_kiocb structs that msg_ring allocates to after the current RCU grace period. But this only covers requests that don't end up in the allocation cache. If a request goes into the alloc cache, it can get reused before it is sane to do so. A recent syzbot report would seem to indicate that there's something there, however it may very well just be because of the KASAN poisoning that the alloc_cache handles manually. Rather than attempt to make the alloc_cache sane for that use case, just drop the usage of the alloc_cache for msg_ring request payload data. 
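The rule being enforced, shown in isolation (a generic sketch, not io_uring code): memory that RCU readers may still dereference must not be reused before a grace period has elapsed, and recycling through an allocation cache is exactly such a premature reuse:

struct obj {
        struct rcu_head rcu_head;
        /* payload */
};

static void obj_release(struct obj *o)
{
        /*
         * Defers the kfree() until after the current RCU grace period.
         * Returning o to a recycling cache here instead would let a new
         * owner overwrite it while readers can still observe it.
         */
        kfree_rcu(o, rcu_head);
}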
Fixes: 50cf5f3842af ("io_uring/msg_ring: add an alloc cache for io_kiocb entries") Link: https://lore.kernel.org/io-uring/68cc2687.050a0220.139b6.0005.GAE@google.com/ Reported-by: syzbot+baa2e0f4e02df602583e@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 --- io_uring/io_uring.c | 4 ---- io_uring/msg_ring.c | 24 ++---------------------- 3 files changed, 2 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 80a178f3d896..12f5ee43850e 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -420,9 +420,6 @@ struct io_ring_ctx { struct list_head defer_list; unsigned nr_drained; - struct io_alloc_cache msg_cache; - spinlock_t msg_lock; - #ifdef CONFIG_NET_RX_BUSY_POLL struct list_head napi_list; /* track busy poll napi_id */ spinlock_t napi_lock; /* napi_list lock */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bcec12256f34..93665cebe9bd 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -290,7 +290,6 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free); - io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); io_rsrc_cache_free(ctx); } @@ -337,9 +336,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_cmd), sizeof(struct io_async_cmd)); - spin_lock_init(&ctx->msg_lock); - ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_kiocb), 0); ret |= io_futex_cache_init(ctx); ret |= io_rsrc_cache_init(ctx); if (ret) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 4c2578f2efcb..5e5b94236d72 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -11,7 +11,6 @@ #include "io_uring.h" #include "rsrc.h" #include "filetable.h" -#include "alloc_cache.h" #include "msg_ring.h" /* All valid masks for MSG_RING */ @@ -76,13 +75,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) struct io_ring_ctx *ctx = req->ctx; io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags); - if (spin_trylock(&ctx->msg_lock)) { - if (io_alloc_cache_put(&ctx->msg_cache, req)) - req = NULL; - spin_unlock(&ctx->msg_lock); - } - if (req) - kfree_rcu(req, rcu_head); + kfree_rcu(req, rcu_head); percpu_ref_put(&ctx->refs); } @@ -104,26 +97,13 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, return 0; } -static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) -{ - struct io_kiocb *req = NULL; - - if (spin_trylock(&ctx->msg_lock)) { - req = io_alloc_cache_get(&ctx->msg_cache); - spin_unlock(&ctx->msg_lock); - if (req) - return req; - } - return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); -} - static int io_msg_data_remote(struct io_ring_ctx *target_ctx, struct io_msg *msg) { struct io_kiocb *target; u32 flags = 0; - target = io_msg_get_kiocb(target_ctx); + target = kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO) ; if (unlikely(!target)) return -ENOMEM; -- cgit v1.2.3 From a660194dd101e937c319171ad99c3fbe466fd825 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Wed, 17 Sep 2025 12:02:07 -0700 Subject: arm64: Enable permission change on arm64 kernel block mappings This patch 
paves the way to enable huge mappings in vmalloc space and linear map space by default on arm64. For this we must ensure that we can handle any permission games on the kernel (init_mm) pagetable. Previously, __change_memory_common() used apply_to_page_range(), which does not support changing permissions for block mappings. We move away from this by using the pagewalk API, similar to what riscv does right now. It is the responsibility of the caller to ensure that the range over which permissions are being changed falls on leaf mapping boundaries. For systems with BBML2, this will be handled in future patches by dynamically splitting the mappings when required. Unlike apply_to_page_range(), the pagewalk API currently requires the init_mm.mmap_lock to be held. To avoid the unnecessary bottleneck of the mmap_lock for our use case, this patch extends this generic API to be used locklessly, so as to retain the existing behaviour for changing permissions. Apart from this reason, it is noted at [1] that KFENCE can manipulate kernel pgtable entries during softirqs. It does this by calling set_memory_valid() -> __change_memory_common(). This being a non-sleepable context, we cannot take the init_mm mmap lock. Add comments to highlight the conditions under which we can use the lockless variant - no underlying VMA, and the user having exclusive control over the range - thus guaranteeing no concurrent access. We require that the start and end of a given range do not partially overlap block mappings or cont mappings. Return -EINVAL in case a partial block mapping is detected in any of the PGD/P4D/PUD/PMD levels; add a corresponding comment in update_range_prot() to warn that eliminating such a condition is the responsibility of the caller. Note that the pte level callback may change permissions for a whole contpte block, and that will be done one pte at a time, as opposed to an atomic operation for the block mappings. This is fine, as any access will decode either the old or the new permission until the TLBI. apply_to_page_range() currently performs all pte level callbacks while in lazy mmu mode. Since arm64 can optimize performance by batching barriers when modifying kernel pgtables in lazy mmu mode, we would like to continue to benefit from this optimisation. Unfortunately, walk_kernel_page_table_range() does not use lazy mmu mode. However, since the pagewalk framework is not allocating any memory, we can safely bracket the whole operation inside lazy mmu mode ourselves. Therefore, wrap the call to walk_kernel_page_table_range() with the lazy MMU helpers.
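The resulting caller contract can be summarised with a short sketch (make_range_ro() is hypothetical; update_range_prot() is the helper introduced below):

static int make_range_ro(unsigned long start, unsigned long size)
{
        pgprot_t set_mask = __pgprot(PTE_RDONLY);
        pgprot_t clear_mask = __pgprot(PTE_WRITE);

        /*
         * Caller guarantees: no underlying VMA, exclusive access to the
         * range, and [start, start + size) does not partially overlap a
         * block or cont mapping - otherwise -EINVAL comes back.
         */
        return update_range_prot(start, size, set_mask, clear_mask);
}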
Link: https://lore.kernel.org/linux-arm-kernel/89d0ad18-4772-4d8f-ae8a-7c48d26a927e@arm.com/ [1] Signed-off-by: Dev Jain Signed-off-by: Yang Shi Reviewed-by: Ryan Roberts Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/mm/pageattr.c | 119 +++++++++++++++++++++++++++++++++++------------ include/linux/pagewalk.h | 3 ++ mm/pagewalk.c | 36 +++++++++----- 3 files changed, 115 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 667aff1efe49..c0648764c403 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -20,6 +21,65 @@ struct page_change_data { pgprot_t clear_mask; }; +static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk) +{ + struct page_change_data *masks = walk->private; + + val &= ~(pgprot_val(masks->clear_mask)); + val |= (pgprot_val(masks->set_mask)); + + return val; +} + +static int pageattr_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pud_t val = pudp_get(pud); + + if (pud_sect(val)) { + if (WARN_ON_ONCE((next - addr) != PUD_SIZE)) + return -EINVAL; + val = __pud(set_pageattr_masks(pud_val(val), walk)); + set_pud(pud, val); + walk->action = ACTION_CONTINUE; + } + + return 0; +} + +static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pmd_t val = pmdp_get(pmd); + + if (pmd_sect(val)) { + if (WARN_ON_ONCE((next - addr) != PMD_SIZE)) + return -EINVAL; + val = __pmd(set_pageattr_masks(pmd_val(val), walk)); + set_pmd(pmd, val); + walk->action = ACTION_CONTINUE; + } + + return 0; +} + +static int pageattr_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t val = __ptep_get(pte); + + val = __pte(set_pageattr_masks(pte_val(val), walk)); + __set_pte(pte, val); + + return 0; +} + +static const struct mm_walk_ops pageattr_ops = { + .pud_entry = pageattr_pud_entry, + .pmd_entry = pageattr_pmd_entry, + .pte_entry = pageattr_pte_entry, +}; + bool rodata_full __ro_after_init = true; bool can_set_direct_map(void) @@ -37,32 +97,35 @@ bool can_set_direct_map(void) arm64_kfence_can_set_direct_map() || is_realm_world(); } -static int change_page_range(pte_t *ptep, unsigned long addr, void *data) +static int update_range_prot(unsigned long start, unsigned long size, + pgprot_t set_mask, pgprot_t clear_mask) { - struct page_change_data *cdata = data; - pte_t pte = __ptep_get(ptep); + struct page_change_data data; + int ret; - pte = clear_pte_bit(pte, cdata->clear_mask); - pte = set_pte_bit(pte, cdata->set_mask); + data.set_mask = set_mask; + data.clear_mask = clear_mask; - __set_pte(ptep, pte); - return 0; + arch_enter_lazy_mmu_mode(); + + /* + * The caller must ensure that the range we are operating on does not + * partially overlap a block mapping, or a cont mapping. Any such case + * must be eliminated by splitting the mapping. + */ + ret = walk_kernel_page_table_range_lockless(start, start + size, + &pageattr_ops, NULL, &data); + arch_leave_lazy_mmu_mode(); + + return ret; } -/* - * This function assumes that the range is mapped with PAGE_SIZE pages. 
- */ static int __change_memory_common(unsigned long start, unsigned long size, - pgprot_t set_mask, pgprot_t clear_mask) + pgprot_t set_mask, pgprot_t clear_mask) { - struct page_change_data data; int ret; - data.set_mask = set_mask; - data.clear_mask = clear_mask; - - ret = apply_to_page_range(&init_mm, start, size, change_page_range, - &data); + ret = update_range_prot(start, size, set_mask, clear_mask); /* * If the memory is being made valid without changing any other bits @@ -174,32 +237,26 @@ int set_memory_valid(unsigned long addr, int numpages, int enable) int set_direct_map_invalid_noflush(struct page *page) { - struct page_change_data data = { - .set_mask = __pgprot(0), - .clear_mask = __pgprot(PTE_VALID), - }; + pgprot_t clear_mask = __pgprot(PTE_VALID); + pgprot_t set_mask = __pgprot(0); if (!can_set_direct_map()) return 0; - return apply_to_page_range(&init_mm, - (unsigned long)page_address(page), - PAGE_SIZE, change_page_range, &data); + return update_range_prot((unsigned long)page_address(page), + PAGE_SIZE, set_mask, clear_mask); } int set_direct_map_default_noflush(struct page *page) { - struct page_change_data data = { - .set_mask = __pgprot(PTE_VALID | PTE_WRITE), - .clear_mask = __pgprot(PTE_RDONLY), - }; + pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE); + pgprot_t clear_mask = __pgprot(PTE_RDONLY); if (!can_set_direct_map()) return 0; - return apply_to_page_range(&init_mm, - (unsigned long)page_address(page), - PAGE_SIZE, change_page_range, &data); + return update_range_prot((unsigned long)page_address(page), + PAGE_SIZE, set_mask, clear_mask); } static int __set_memory_enc_dec(unsigned long addr, diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 682472c15495..88e18615dd72 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -134,6 +134,9 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, int walk_kernel_page_table_range(unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private); +int walk_kernel_page_table_range_lockless(unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + pgd_t *pgd, void *private); int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 648038247a8d..936689d8bcac 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -606,10 +606,32 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, int walk_kernel_page_table_range(unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private) { - struct mm_struct *mm = &init_mm; + /* + * Kernel intermediate page tables are usually not freed, so the mmap + * read lock is sufficient. But there are some exceptions. + * E.g. memory hot-remove. In which case, the mmap lock is insufficient + * to prevent the intermediate kernel pages tables belonging to the + * specified address range from being freed. The caller should take + * other actions to prevent this race. + */ + mmap_assert_locked(&init_mm); + + return walk_kernel_page_table_range_lockless(start, end, ops, pgd, + private); +} + +/* + * Use this function to walk the kernel page tables locklessly. It should be + * guaranteed that the caller has exclusive access over the range they are + * operating on - that there should be no concurrent access, for example, + * changing permissions for vmalloc objects. 
+ */ +int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end, + const struct mm_walk_ops *ops, pgd_t *pgd, void *private) +{ struct mm_walk walk = { .ops = ops, - .mm = mm, + .mm = &init_mm, .pgd = pgd, .private = private, .no_vma = true @@ -620,16 +642,6 @@ int walk_kernel_page_table_range(unsigned long start, unsigned long end, if (!check_ops_valid(ops)) return -EINVAL; - /* - * Kernel intermediate page tables are usually not freed, so the mmap - * read lock is sufficient. But there are some exceptions. - * E.g. memory hot-remove. In which case, the mmap lock is insufficient - * to prevent the intermediate kernel pages tables belonging to the - * specified address range from being freed. The caller should take - * other actions to prevent this race. - */ - mmap_assert_locked(mm); - return walk_pgd_range(start, end, &walk); } -- cgit v1.2.3 From 76cffc3eb1bdee0a7e8cca090adfd46a740f1cb0 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 9 Sep 2025 13:19:44 +0100 Subject: soundwire: bus: add of_sdw_find_device_by_node helper There have been more than 3 instances of this helper in multiple codec drivers; it does not make sense to keep duplicating this part of the code. Let's add a helper, of_sdw_find_device_by_node(), for codec drivers to use. Signed-off-by: Srinivas Kandagatla Reviewed-by: Dmitry Baryshkov Acked-by: Vinod Koul Link: https://patch.msgid.link/20250909121954.225833-4-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- drivers/soundwire/slave.c | 6 ++++++ include/linux/soundwire/sdw.h | 9 +++++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/drivers/soundwire/slave.c b/drivers/soundwire/slave.c index d2d99555ec5a..3d4d00188c26 100644 --- a/drivers/soundwire/slave.c +++ b/drivers/soundwire/slave.c @@ -273,4 +273,10 @@ int sdw_of_find_slaves(struct sdw_bus *bus) return 0; } +struct device *of_sdw_find_device_by_node(struct device_node *np) +{ + return bus_find_device_by_of_node(&sdw_bus_type, np); +} +EXPORT_SYMBOL_GPL(of_sdw_find_device_by_node); + MODULE_IMPORT_NS("SND_SOC_SDCA"); diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 0832776262ac..096213956d31 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -19,6 +19,7 @@ struct dentry; struct fwnode_handle; +struct device_node; struct sdw_bus; struct sdw_slave; @@ -1086,6 +1087,8 @@ int sdw_stream_add_slave(struct sdw_slave *slave, int sdw_stream_remove_slave(struct sdw_slave *slave, struct sdw_stream_runtime *stream); +struct device *of_sdw_find_device_by_node(struct device_node *np); + int sdw_slave_get_scale_index(struct sdw_slave *slave, u8 *base); /* messaging and data APIs */ @@ -1119,6 +1122,12 @@ static inline int sdw_stream_remove_slave(struct sdw_slave *slave, return -EINVAL; } +static inline struct device *of_sdw_find_device_by_node(struct device_node *np) +{ + WARN_ONCE(1, "SoundWire API is disabled"); + return NULL; +} + /* messaging and data APIs */ static inline int sdw_read(struct sdw_slave *slave, u32 addr) { -- cgit v1.2.3 From 2e07017b28e8bbace4a4973d11d0646575d36f94 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Tue, 9 Sep 2025 13:19:45 +0100 Subject: soundwire: bus: add sdw_slave_get_current_bank helper There have been 2 instances of this helper in codec drivers; it does not make sense to keep duplicating this part of the code. Let's add a helper, sdw_slave_get_current_bank(), for codec drivers to use.
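Typical codec-driver usage of the two new helpers might look like the sketch below (codec_query_bank() is hypothetical; dev_to_sdw_dev() is the existing device-to-slave conversion helper):

static int codec_query_bank(struct device_node *np)
{
        struct sdw_slave *slave;
        struct device *dev;
        int bank;

        dev = of_sdw_find_device_by_node(np);
        if (!dev)
                return -ENODEV;

        slave = dev_to_sdw_dev(dev);
        bank = sdw_slave_get_current_bank(slave);       /* 0, 1 or -errno */

        put_device(dev);        /* drop the reference taken by the lookup */
        return bank;
}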
Signed-off-by: Srinivas Kandagatla Acked-by: Vinod Koul Reviewed-by: Dmitry Baryshkov Link: https://patch.msgid.link/20250909121954.225833-5-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Mark Brown --- drivers/soundwire/bus.c | 12 ++++++++++++ include/linux/soundwire/sdw.h | 8 ++++++++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/drivers/soundwire/bus.c b/drivers/soundwire/bus.c index 4fd5cac799c5..55c1db816534 100644 --- a/drivers/soundwire/bus.c +++ b/drivers/soundwire/bus.c @@ -1360,6 +1360,18 @@ int sdw_slave_get_scale_index(struct sdw_slave *slave, u8 *base) } EXPORT_SYMBOL(sdw_slave_get_scale_index); +int sdw_slave_get_current_bank(struct sdw_slave *slave) +{ + int tmp; + + tmp = sdw_read(slave, SDW_SCP_CTRL); + if (tmp < 0) + return tmp; + + return FIELD_GET(SDW_SCP_STAT_CURR_BANK, tmp); +} +EXPORT_SYMBOL_GPL(sdw_slave_get_current_bank); + static int sdw_slave_set_frequency(struct sdw_slave *slave) { int scale_index; diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 096213956d31..e6a3476bcef1 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -1089,6 +1089,8 @@ int sdw_stream_remove_slave(struct sdw_slave *slave, struct device *of_sdw_find_device_by_node(struct device_node *np); +int sdw_slave_get_current_bank(struct sdw_slave *sdev); + int sdw_slave_get_scale_index(struct sdw_slave *slave, u8 *base); /* messaging and data APIs */ @@ -1128,6 +1130,12 @@ static inline struct device *of_sdw_find_device_by_node(struct device_node *np) return NULL; } +static inline int sdw_slave_get_current_bank(struct sdw_slave *sdev) +{ + WARN_ONCE(1, "SoundWire API is disabled"); + return -EINVAL; +} + /* messaging and data APIs */ static inline int sdw_read(struct sdw_slave *slave, u32 addr) { -- cgit v1.2.3 From ef1e734dbe257ce8bc42383b9977b5558f061288 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 18 Sep 2025 20:06:48 +0300 Subject: power: supply: max77705_charger: use regfields for config registers Using regfields allows cleaning up the mask and register offset definitions, making it possible to access register info by its functional name.
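For reference, the regmap_field pattern adopted here reduces each masked read-modify-write to a single call. A minimal, hypothetical sketch (the field and variable names are illustrative, not taken from the driver):

    /* bit 7 of CNFG_09 holds the charger-enable field in this driver */
    static const struct reg_field demo_field =
        REG_FIELD(MAX77705_CHG_REG_CNFG_09, 7, 7);

    struct regmap_field *f;

    f = devm_regmap_field_alloc(dev, regmap, demo_field);
    if (IS_ERR(f))
        return PTR_ERR(f);

    /* the mask and shift are derived from the reg_field automatically */
    regmap_field_write(f, 1);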
Signed-off-by: Dzmitry Sankouski Signed-off-by: Sebastian Reichel --- drivers/power/supply/max77705_charger.c | 105 ++++++++++++-------------------- include/linux/power/max77705_charger.h | 102 ++++++++++++++++--------------- 2 files changed, 93 insertions(+), 114 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c index 883affd18c8d..837e03bafcc6 100644 --- a/drivers/power/supply/max77705_charger.c +++ b/drivers/power/supply/max77705_charger.c @@ -74,8 +74,7 @@ static int max77705_charger_enable(struct max77705_charger_data *chg) { int rv; - rv = regmap_update_bits(chg->regmap, MAX77705_CHG_REG_CNFG_09, - MAX77705_CHG_EN_MASK, MAX77705_CHG_EN_MASK); + rv = regmap_field_write(chg->rfield[MAX77705_CHG_EN], 1); if (rv) dev_err(chg->dev, "unable to enable the charger: %d\n", rv); @@ -87,10 +86,7 @@ static void max77705_charger_disable(void *data) struct max77705_charger_data *chg = data; int rv; - rv = regmap_update_bits(chg->regmap, - MAX77705_CHG_REG_CNFG_09, - MAX77705_CHG_EN_MASK, - MAX77705_CHG_DISABLE); + rv = regmap_field_write(chg->rfield[MAX77705_CHG_EN], MAX77705_CHG_DISABLE); if (rv) dev_err(chg->dev, "unable to disable the charger: %d\n", rv); } @@ -134,10 +130,10 @@ static int max77705_check_battery(struct max77705_charger_data *chg, int *val) static int max77705_get_charge_type(struct max77705_charger_data *chg, int *val) { struct regmap *regmap = chg->regmap; - unsigned int reg_data; + unsigned int reg_data, chg_en; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_09, ®_data); - if (!MAX77705_CHARGER_CHG_CHARGING(reg_data)) { + regmap_field_read(chg->rfield[MAX77705_CHG_EN], &chg_en); + if (!chg_en) { *val = POWER_SUPPLY_CHARGE_TYPE_NONE; return 0; } @@ -162,10 +158,10 @@ static int max77705_get_charge_type(struct max77705_charger_data *chg, int *val) static int max77705_get_status(struct max77705_charger_data *chg, int *val) { struct regmap *regmap = chg->regmap; - unsigned int reg_data; + unsigned int reg_data, chg_en; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_09, ®_data); - if (!MAX77705_CHARGER_CHG_CHARGING(reg_data)) { + regmap_field_read(chg->rfield[MAX77705_CHG_EN], &chg_en); + if (!chg_en) { *val = POWER_SUPPLY_CHARGE_TYPE_NONE; return 0; } @@ -295,16 +291,11 @@ static int max77705_get_input_current(struct max77705_charger_data *chg, { unsigned int reg_data; int get_current = 0; - struct regmap *regmap = chg->regmap; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_09, ®_data); - - reg_data &= MAX77705_CHG_CHGIN_LIM_MASK; + regmap_field_read(chg->rfield[MAX77705_CHG_CHGIN_LIM], ®_data); if (reg_data <= 3) get_current = MAX77705_CURRENT_CHGIN_MIN; - else if (reg_data >= MAX77705_CHG_CHGIN_LIM_MASK) - get_current = MAX77705_CURRENT_CHGIN_MAX; else get_current = (reg_data + 1) * MAX77705_CURRENT_CHGIN_STEP; @@ -317,10 +308,8 @@ static int max77705_get_charge_current(struct max77705_charger_data *chg, int *val) { unsigned int reg_data; - struct regmap *regmap = chg->regmap; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_02, ®_data); - reg_data &= MAX77705_CHG_CC; + regmap_field_read(chg->rfield[MAX77705_CHG_CC_LIM], ®_data); *val = reg_data <= 0x2 ? MAX77705_CURRENT_CHGIN_MIN : reg_data * MAX77705_CURRENT_CHG_STEP; @@ -332,7 +321,6 @@ static int max77705_set_float_voltage(struct max77705_charger_data *chg, { int float_voltage_mv; unsigned int reg_data = 0; - struct regmap *regmap = chg->regmap; float_voltage_mv = float_voltage / 1000; reg_data = float_voltage_mv <= 4000 ? 
0x0 : @@ -340,9 +328,7 @@ static int max77705_set_float_voltage(struct max77705_charger_data *chg, (float_voltage_mv <= 4200) ? (float_voltage_mv - 4000) / 50 : (((float_voltage_mv - 4200) / 10) + 0x04); - return regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_04, - MAX77705_CHG_CV_PRM_MASK, - (reg_data << MAX77705_CHG_CV_PRM_SHIFT)); + return regmap_field_write(chg->rfield[MAX77705_CHG_CV_PRM], reg_data); } static int max77705_get_float_voltage(struct max77705_charger_data *chg, @@ -350,10 +336,8 @@ static int max77705_get_float_voltage(struct max77705_charger_data *chg, { unsigned int reg_data = 0; int voltage_mv; - struct regmap *regmap = chg->regmap; - regmap_read(regmap, MAX77705_CHG_REG_CNFG_04, ®_data); - reg_data &= MAX77705_CHG_PRM_MASK; + regmap_field_read(chg->rfield[MAX77705_CHG_CV_PRM], ®_data); voltage_mv = reg_data <= 0x04 ? reg_data * 50 + 4000 : (reg_data - 4) * 10 + 4200; *val = voltage_mv * 1000; @@ -418,7 +402,6 @@ static void max77705_chgin_isr_work(struct work_struct *work) static void max77705_charger_initialize(struct max77705_charger_data *chg) { - u8 reg_data; struct power_supply_battery_info *info; struct regmap *regmap = chg->regmap; @@ -429,45 +412,31 @@ static void max77705_charger_initialize(struct max77705_charger_data *chg) /* unlock charger setting protect */ /* slowest LX slope */ - reg_data = MAX77705_CHGPROT_MASK | MAX77705_SLOWEST_LX_SLOPE; - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_06, reg_data, - reg_data); + regmap_field_write(chg->rfield[MAX77705_CHGPROT], MAX77705_CHGPROT_UNLOCKED); + regmap_field_write(chg->rfield[MAX77705_LX_SLOPE], MAX77705_SLOWEST_LX_SLOPE); /* fast charge timer disable */ /* restart threshold disable */ /* pre-qual charge disable */ - reg_data = (MAX77705_FCHGTIME_DISABLE << MAX77705_FCHGTIME_SHIFT) | - (MAX77705_CHG_RSTRT_DISABLE << MAX77705_CHG_RSTRT_SHIFT) | - (MAX77705_CHG_PQEN_DISABLE << MAX77705_PQEN_SHIFT); - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_01, - (MAX77705_FCHGTIME_MASK | - MAX77705_CHG_RSTRT_MASK | - MAX77705_PQEN_MASK), - reg_data); - - /* OTG off(UNO on), boost off */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_00, - MAX77705_OTG_CTRL, 0); + regmap_field_write(chg->rfield[MAX77705_FCHGTIME], MAX77705_FCHGTIME_DISABLE); + regmap_field_write(chg->rfield[MAX77705_CHG_RSTRT], MAX77705_CHG_RSTRT_DISABLE); + regmap_field_write(chg->rfield[MAX77705_CHG_PQEN], MAX77705_CHG_PQEN_DISABLE); + + regmap_field_write(chg->rfield[MAX77705_MODE], + MAX77705_CHG_MASK | MAX77705_BUCK_MASK); /* charge current 450mA(default) */ /* otg current limit 900mA */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_02, - MAX77705_OTG_ILIM_MASK, - MAX77705_OTG_ILIM_900 << MAX77705_OTG_ILIM_SHIFT); + regmap_field_write(chg->rfield[MAX77705_OTG_ILIM], MAX77705_OTG_ILIM_900); /* BAT to SYS OCP 4.80A */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_05, - MAX77705_REG_B2SOVRC_MASK, - MAX77705_B2SOVRC_4_8A << MAX77705_REG_B2SOVRC_SHIFT); + regmap_field_write(chg->rfield[MAX77705_REG_B2SOVRC], MAX77705_B2SOVRC_4_8A); + /* top off current 150mA */ /* top off timer 30min */ - reg_data = (MAX77705_TO_ITH_150MA << MAX77705_TO_ITH_SHIFT) | - (MAX77705_TO_TIME_30M << MAX77705_TO_TIME_SHIFT) | - (MAX77705_SYS_TRACK_DISABLE << MAX77705_SYS_TRACK_DIS_SHIFT); - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_03, - (MAX77705_TO_ITH_MASK | - MAX77705_TO_TIME_MASK | - MAX77705_SYS_TRACK_DIS_MASK), reg_data); + regmap_field_write(chg->rfield[MAX77705_TO], MAX77705_TO_ITH_150MA); + 
regmap_field_write(chg->rfield[MAX77705_TO_TIME], MAX77705_TO_TIME_30M); + regmap_field_write(chg->rfield[MAX77705_SYS_TRACK], MAX77705_SYS_TRACK_DISABLE); /* cv voltage 4.2V or 4.35V */ /* MINVSYS 3.6V(default) */ @@ -478,25 +447,21 @@ static void max77705_charger_initialize(struct max77705_charger_data *chg) max77705_set_float_voltage(chg, info->voltage_max_design_uv); } - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_12, - MAX77705_VCHGIN_REG_MASK, MAX77705_VCHGIN_4_5); - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_12, - MAX77705_WCIN_REG_MASK, MAX77705_WCIN_4_5); + regmap_field_write(chg->rfield[MAX77705_VCHGIN], MAX77705_VCHGIN_4_5); + regmap_field_write(chg->rfield[MAX77705_WCIN], MAX77705_WCIN_4_5); /* Watchdog timer */ regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_00, MAX77705_WDTEN_MASK, 0); /* VBYPSET=5.0V */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_11, MAX77705_VBYPSET_MASK, 0); + regmap_field_write(chg->rfield[MAX77705_VBYPSET], 0); /* Switching Frequency : 1.5MHz */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_08, MAX77705_REG_FSW_MASK, - (MAX77705_CHG_FSW_1_5MHz << MAX77705_REG_FSW_SHIFT)); + regmap_field_write(chg->rfield[MAX77705_REG_FSW], MAX77705_CHG_FSW_1_5MHz); /* Auto skip mode */ - regmap_update_bits(regmap, MAX77705_CHG_REG_CNFG_12, MAX77705_REG_DISKIP_MASK, - (MAX77705_AUTO_SKIP << MAX77705_REG_DISKIP_SHIFT)); + regmap_field_write(chg->rfield[MAX77705_REG_DISKIP], MAX77705_AUTO_SKIP); } static int max77705_charger_probe(struct i2c_client *i2c) @@ -520,6 +485,14 @@ static int max77705_charger_probe(struct i2c_client *i2c) if (IS_ERR(chg->regmap)) return PTR_ERR(chg->regmap); + for (int i = 0; i < MAX77705_N_REGMAP_FIELDS; i++) { + chg->rfield[i] = devm_regmap_field_alloc(dev, chg->regmap, + max77705_reg_field[i]); + if (IS_ERR(chg->rfield[i])) + return dev_err_probe(dev, PTR_ERR(chg->rfield[i]), + "cannot allocate regmap field\n"); + } + ret = regmap_update_bits(chg->regmap, MAX77705_CHG_REG_INT_MASK, MAX77705_CHGIN_IM, 0); diff --git a/include/linux/power/max77705_charger.h b/include/linux/power/max77705_charger.h index fdec9af9c541..a612795577b6 100644 --- a/include/linux/power/max77705_charger.h +++ b/include/linux/power/max77705_charger.h @@ -9,6 +9,8 @@ #ifndef __MAX77705_CHARGER_H #define __MAX77705_CHARGER_H __FILE__ +#include + /* MAX77705_CHG_REG_CHG_INT */ #define MAX77705_BYP_I BIT(0) #define MAX77705_INP_LIMIT_I BIT(1) @@ -63,7 +65,6 @@ #define MAX77705_BUCK_SHIFT 2 #define MAX77705_BOOST_SHIFT 3 #define MAX77705_WDTEN_SHIFT 4 -#define MAX77705_MODE_MASK GENMASK(3, 0) #define MAX77705_CHG_MASK BIT(MAX77705_CHG_SHIFT) #define MAX77705_UNO_MASK BIT(MAX77705_UNO_SHIFT) #define MAX77705_OTG_MASK BIT(MAX77705_OTG_SHIFT) @@ -74,34 +75,19 @@ #define MAX77705_OTG_CTRL (MAX77705_OTG_MASK | MAX77705_BOOST_MASK) /* MAX77705_CHG_REG_CNFG_01 */ -#define MAX77705_FCHGTIME_SHIFT 0 -#define MAX77705_FCHGTIME_MASK GENMASK(2, 0) -#define MAX77705_CHG_RSTRT_SHIFT 4 -#define MAX77705_CHG_RSTRT_MASK GENMASK(5, 4) #define MAX77705_FCHGTIME_DISABLE 0 #define MAX77705_CHG_RSTRT_DISABLE 0x3 -#define MAX77705_PQEN_SHIFT 7 -#define MAX77705_PQEN_MASK BIT(7) #define MAX77705_CHG_PQEN_DISABLE 0 #define MAX77705_CHG_PQEN_ENABLE 1 /* MAX77705_CHG_REG_CNFG_02 */ -#define MAX77705_OTG_ILIM_SHIFT 6 -#define MAX77705_OTG_ILIM_MASK GENMASK(7, 6) #define MAX77705_OTG_ILIM_500 0 #define MAX77705_OTG_ILIM_900 1 #define MAX77705_OTG_ILIM_1200 2 #define MAX77705_OTG_ILIM_1500 3 -#define MAX77705_CHG_CC GENMASK(5, 0) /* MAX77705_CHG_REG_CNFG_03 */ -#define 
MAX77705_TO_ITH_SHIFT 0 -#define MAX77705_TO_ITH_MASK GENMASK(2, 0) -#define MAX77705_TO_TIME_SHIFT 3 -#define MAX77705_TO_TIME_MASK GENMASK(5, 3) -#define MAX77705_SYS_TRACK_DIS_SHIFT 7 -#define MAX77705_SYS_TRACK_DIS_MASK BIT(7) #define MAX77705_TO_ITH_150MA 0 #define MAX77705_TO_TIME_30M 3 #define MAX77705_SYS_TRACK_ENABLE 0 @@ -110,15 +96,8 @@ /* MAX77705_CHG_REG_CNFG_04 */ #define MAX77705_CHG_MINVSYS_SHIFT 6 #define MAX77705_CHG_MINVSYS_MASK GENMASK(7, 6) -#define MAX77705_CHG_PRM_SHIFT 0 -#define MAX77705_CHG_PRM_MASK GENMASK(5, 0) - -#define MAX77705_CHG_CV_PRM_SHIFT 0 -#define MAX77705_CHG_CV_PRM_MASK GENMASK(5, 0) /* MAX77705_CHG_REG_CNFG_05 */ -#define MAX77705_REG_B2SOVRC_SHIFT 0 -#define MAX77705_REG_B2SOVRC_MASK GENMASK(3, 0) #define MAX77705_B2SOVRC_DISABLE 0 #define MAX77705_B2SOVRC_4_5A 6 #define MAX77705_B2SOVRC_4_8A 8 @@ -128,9 +107,8 @@ #define MAX77705_WDTCLR_SHIFT 0 #define MAX77705_WDTCLR_MASK GENMASK(1, 0) #define MAX77705_WDTCLR 1 -#define MAX77705_CHGPROT_MASK GENMASK(3, 2) -#define MAX77705_CHGPROT_UNLOCKED GENMASK(3, 2) -#define MAX77705_SLOWEST_LX_SLOPE GENMASK(6, 5) +#define MAX77705_CHGPROT_UNLOCKED 3 +#define MAX77705_SLOWEST_LX_SLOPE 3 /* MAX77705_CHG_REG_CNFG_07 */ #define MAX77705_CHG_FMBST 4 @@ -140,36 +118,14 @@ #define MAX77705_REG_FGSRC_MASK BIT(MAX77705_REG_FGSRC_SHIFT) /* MAX77705_CHG_REG_CNFG_08 */ -#define MAX77705_REG_FSW_SHIFT 0 -#define MAX77705_REG_FSW_MASK GENMASK(1, 0) #define MAX77705_CHG_FSW_3MHz 0 #define MAX77705_CHG_FSW_2MHz 1 #define MAX77705_CHG_FSW_1_5MHz 2 /* MAX77705_CHG_REG_CNFG_09 */ -#define MAX77705_CHG_CHGIN_LIM_MASK GENMASK(6, 0) -#define MAX77705_CHG_EN_MASK BIT(7) #define MAX77705_CHG_DISABLE 0 -#define MAX77705_CHARGER_CHG_CHARGING(_reg) \ - (((_reg) & MAX77705_CHG_EN_MASK) > 1) - - -/* MAX77705_CHG_REG_CNFG_10 */ -#define MAX77705_CHG_WCIN_LIM GENMASK(5, 0) - -/* MAX77705_CHG_REG_CNFG_11 */ -#define MAX77705_VBYPSET_SHIFT 0 -#define MAX77705_VBYPSET_MASK GENMASK(6, 0) /* MAX77705_CHG_REG_CNFG_12 */ -#define MAX77705_CHGINSEL_SHIFT 5 -#define MAX77705_CHGINSEL_MASK BIT(MAX77705_CHGINSEL_SHIFT) -#define MAX77705_WCINSEL_SHIFT 6 -#define MAX77705_WCINSEL_MASK BIT(MAX77705_WCINSEL_SHIFT) -#define MAX77705_VCHGIN_REG_MASK GENMASK(4, 3) -#define MAX77705_WCIN_REG_MASK GENMASK(2, 1) -#define MAX77705_REG_DISKIP_SHIFT 0 -#define MAX77705_REG_DISKIP_MASK BIT(MAX77705_REG_DISKIP_SHIFT) /* REG=4.5V, UVLO=4.7V */ #define MAX77705_VCHGIN_4_5 0 /* REG=4.5V, UVLO=4.7V */ @@ -183,9 +139,59 @@ #define MAX77705_CURRENT_CHGIN_MIN 100000 #define MAX77705_CURRENT_CHGIN_MAX 3200000 +enum max77705_field_idx { + MAX77705_CHGPROT, + MAX77705_CHG_EN, + MAX77705_CHG_CC_LIM, + MAX77705_CHG_CHGIN_LIM, + MAX77705_CHG_CV_PRM, + MAX77705_CHG_PQEN, + MAX77705_CHG_RSTRT, + MAX77705_CHG_WCIN, + MAX77705_FCHGTIME, + MAX77705_LX_SLOPE, + MAX77705_MODE, + MAX77705_OTG_ILIM, + MAX77705_REG_B2SOVRC, + MAX77705_REG_DISKIP, + MAX77705_REG_FSW, + MAX77705_SYS_TRACK, + MAX77705_TO, + MAX77705_TO_TIME, + MAX77705_VBYPSET, + MAX77705_VCHGIN, + MAX77705_WCIN, + MAX77705_N_REGMAP_FIELDS, +}; + +static const struct reg_field max77705_reg_field[MAX77705_N_REGMAP_FIELDS] = { + [MAX77705_MODE] = REG_FIELD(MAX77705_CHG_REG_CNFG_00, 0, 3), + [MAX77705_FCHGTIME] = REG_FIELD(MAX77705_CHG_REG_CNFG_01, 0, 2), + [MAX77705_CHG_RSTRT] = REG_FIELD(MAX77705_CHG_REG_CNFG_01, 4, 5), + [MAX77705_CHG_PQEN] = REG_FIELD(MAX77705_CHG_REG_CNFG_01, 7, 7), + [MAX77705_CHG_CC_LIM] = REG_FIELD(MAX77705_CHG_REG_CNFG_02, 0, 5), + [MAX77705_OTG_ILIM] = REG_FIELD(MAX77705_CHG_REG_CNFG_02, 6, 7), + 
[MAX77705_TO] = REG_FIELD(MAX77705_CHG_REG_CNFG_03, 0, 2), + [MAX77705_TO_TIME] = REG_FIELD(MAX77705_CHG_REG_CNFG_03, 3, 5), + [MAX77705_SYS_TRACK] = REG_FIELD(MAX77705_CHG_REG_CNFG_03, 7, 7), + [MAX77705_CHG_CV_PRM] = REG_FIELD(MAX77705_CHG_REG_CNFG_04, 0, 5), + [MAX77705_REG_B2SOVRC] = REG_FIELD(MAX77705_CHG_REG_CNFG_05, 0, 3), + [MAX77705_CHGPROT] = REG_FIELD(MAX77705_CHG_REG_CNFG_06, 2, 3), + [MAX77705_LX_SLOPE] = REG_FIELD(MAX77705_CHG_REG_CNFG_06, 5, 6), + [MAX77705_REG_FSW] = REG_FIELD(MAX77705_CHG_REG_CNFG_08, 0, 1), + [MAX77705_CHG_CHGIN_LIM] = REG_FIELD(MAX77705_CHG_REG_CNFG_09, 0, 6), + [MAX77705_CHG_EN] = REG_FIELD(MAX77705_CHG_REG_CNFG_09, 7, 7), + [MAX77705_CHG_WCIN] = REG_FIELD(MAX77705_CHG_REG_CNFG_10, 0, 5), + [MAX77705_VBYPSET] = REG_FIELD(MAX77705_CHG_REG_CNFG_11, 0, 6), + [MAX77705_REG_DISKIP] = REG_FIELD(MAX77705_CHG_REG_CNFG_12, 0, 0), + [MAX77705_WCIN] = REG_FIELD(MAX77705_CHG_REG_CNFG_12, 1, 2), + [MAX77705_VCHGIN] = REG_FIELD(MAX77705_CHG_REG_CNFG_12, 3, 4), +}; + struct max77705_charger_data { struct device *dev; struct regmap *regmap; + struct regmap_field *rfield[MAX77705_N_REGMAP_FIELDS]; struct power_supply_battery_info *bat_info; struct workqueue_struct *wqueue; struct work_struct chgin_work; -- cgit v1.2.3 From bc7d3a0f92dad811110f5602f58fe756cefce2b8 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 18 Sep 2025 20:06:52 +0300 Subject: power: supply: max77705_charger: use REGMAP_IRQ_REG_LINE macro Refactoring the regmap_irq declarations with REGMAP_IRQ_REG_LINE saves a few lines of definitions. Signed-off-by: Dzmitry Sankouski Signed-off-by: Sebastian Reichel --- drivers/power/supply/max77705_charger.c | 16 ++++++------- include/linux/power/max77705_charger.h | 42 +++++++++++++-------------------- 2 files changed, 24 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/supply/max77705_charger.c b/drivers/power/supply/max77705_charger.c index bf065693af45..b1a227bf72e2 100644 --- a/drivers/power/supply/max77705_charger.c +++ b/drivers/power/supply/max77705_charger.c @@ -50,14 +50,14 @@ static irqreturn_t max77705_chgin_irq(int irq, void *irq_drv_data) } static const struct regmap_irq max77705_charger_irqs[] = { - { .mask = MAX77705_BYP_IM, }, - { .mask = MAX77705_INP_LIMIT_IM, }, - { .mask = MAX77705_BATP_IM, }, - { .mask = MAX77705_BAT_IM, }, - { .mask = MAX77705_CHG_IM, }, - { .mask = MAX77705_WCIN_IM, }, - { .mask = MAX77705_CHGIN_IM, }, - { .mask = MAX77705_AICL_IM, }, + REGMAP_IRQ_REG_LINE(MAX77705_BYP_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_INP_LIMIT_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_BATP_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_BAT_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_CHG_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_WCIN_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_CHGIN_I, BITS_PER_BYTE), + REGMAP_IRQ_REG_LINE(MAX77705_AICL_I, BITS_PER_BYTE), }; static struct regmap_irq_chip max77705_charger_irq_chip = { diff --git a/include/linux/power/max77705_charger.h b/include/linux/power/max77705_charger.h index a612795577b6..6653abfdf747 100644 --- a/include/linux/power/max77705_charger.h +++ b/include/linux/power/max77705_charger.h @@ -12,34 +12,24 @@ #include /* MAX77705_CHG_REG_CHG_INT */ -#define MAX77705_BYP_I BIT(0) -#define MAX77705_INP_LIMIT_I BIT(1) -#define MAX77705_BATP_I BIT(2) -#define MAX77705_BAT_I BIT(3) -#define MAX77705_CHG_I BIT(4) -#define MAX77705_WCIN_I BIT(5) -#define MAX77705_CHGIN_I BIT(6) -#define MAX77705_AICL_I BIT(7) -/*
MAX77705_CHG_REG_CHG_INT_MASK */ -#define MAX77705_BYP_IM BIT(0) -#define MAX77705_INP_LIMIT_IM BIT(1) -#define MAX77705_BATP_IM BIT(2) -#define MAX77705_BAT_IM BIT(3) -#define MAX77705_CHG_IM BIT(4) -#define MAX77705_WCIN_IM BIT(5) -#define MAX77705_CHGIN_IM BIT(6) -#define MAX77705_AICL_IM BIT(7) +#define MAX77705_BYP_I (0) +#define MAX77705_INP_LIMIT_I (1) +#define MAX77705_BATP_I (2) +#define MAX77705_BAT_I (3) +#define MAX77705_CHG_I (4) +#define MAX77705_WCIN_I (5) +#define MAX77705_CHGIN_I (6) +#define MAX77705_AICL_I (7) /* MAX77705_CHG_REG_CHG_INT_OK */ -#define MAX77705_BYP_OK BIT(0) -#define MAX77705_DISQBAT_OK BIT(1) -#define MAX77705_BATP_OK BIT(2) -#define MAX77705_BAT_OK BIT(3) -#define MAX77705_CHG_OK BIT(4) -#define MAX77705_WCIN_OK BIT(5) -#define MAX77705_CHGIN_OK BIT(6) -#define MAX77705_AICL_OK BIT(7) +#define MAX77705_BYP_OK BIT(MAX77705_BYP_I) +#define MAX77705_DISQBAT_OK BIT(MAX77705_INP_LIMIT_I) +#define MAX77705_BATP_OK BIT(MAX77705_BATP_I) +#define MAX77705_BAT_OK BIT(MAX77705_BAT_I) +#define MAX77705_CHG_OK BIT(MAX77705_CHG_I) +#define MAX77705_WCIN_OK BIT(MAX77705_WCIN_I) +#define MAX77705_CHGIN_OK BIT(MAX77705_CHGIN_I) +#define MAX77705_AICL_OK BIT(MAX77705_AICL_I) /* MAX77705_CHG_REG_DETAILS_00 */ #define MAX77705_BATP_DTLS BIT(0) -- cgit v1.2.3 From 603b4416232524dafde8e2cf859788dae786dea1 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 14 Sep 2025 23:51:30 +0200 Subject: bpf: Update the bpf_prog_calc_tag to use SHA256 Exclusive maps restrict map access to specific programs using a hash. The current hash used for this is SHA1, which is prone to collisions. This patch uses SHA256, which is more resilient against collisions. This new hash is stored in bpf_prog and used by the verifier to determine if a program can access a given exclusive map. The original 64-bit tags are kept, as they are used by users as a short, possibly colliding program identifier for non-security purposes. Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250914215141.15144-2-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +++++- kernel/bpf/Kconfig | 2 +- kernel/bpf/core.c | 5 ++--- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 41f776071ff5..d75902074bd1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -31,6 +31,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -1717,7 +1718,10 @@ struct bpf_prog { enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ - u8 tag[BPF_TAG_SIZE]; + union { + u8 digest[SHA256_DIGEST_SIZE]; + u8 tag[BPF_TAG_SIZE]; + }; struct bpf_prog_stats __percpu *stats; int __percpu *active; unsigned int (*bpf_func)(const void *ctx, diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index 17067dcb4386..eb3de35734f0 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -3,7 +3,7 @@ # BPF interpreter that, for example, classic socket filters depend on. config BPF bool - select CRYPTO_LIB_SHA1 + select CRYPTO_LIB_SHA256 # Used by archs to tell that they support BPF JIT compiler plus which # flavour. 
Only one of the two can be selected for a specific arch since diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1cda2589d4b3..9b64674df16b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -296,7 +297,6 @@ void __bpf_prog_free(struct bpf_prog *fp) int bpf_prog_calc_tag(struct bpf_prog *fp) { size_t size = bpf_prog_insn_size(fp); - u8 digest[SHA1_DIGEST_SIZE]; struct bpf_insn *dst; bool was_ld_map; u32 i; @@ -327,8 +327,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) was_ld_map = false; } } - sha1((const u8 *)dst, size, digest); - memcpy(fp->tag, digest, sizeof(fp->tag)); + sha256((u8 *)dst, size, fp->digest); vfree(dst); return 0; } -- cgit v1.2.3 From baefdbdf6812e120c9fba9cfb101d3656f478026 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 14 Sep 2025 23:51:31 +0200 Subject: bpf: Implement exclusive map creation Exclusive maps allow a map to be accessed only by a program with a matching hash, which is specified in the excl_prog_hash attr. For the signing use-case, this allows the trusted loader program to load the map and verify its integrity. Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250914215141.15144-3-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 6 ++++++ kernel/bpf/syscall.c | 31 +++++++++++++++++++++++++++---- kernel/bpf/verifier.c | 6 ++++++ tools/include/uapi/linux/bpf.h | 6 ++++++ 5 files changed, 46 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d75902074bd1..c6a6ee1b2938 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -329,6 +329,7 @@ struct bpf_map { atomic64_t sleepable_refcnt; s64 __percpu *elem_count; u64 cookie; /* write-once */ + char *excl_prog_sha; }; static inline const char *btf_field_type_name(enum btf_field_type type) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 233de8677382..57687b2e1c47 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1522,6 +1522,12 @@ union bpf_attr { * If provided, map_flags should have BPF_F_TOKEN_FD flag set. */ __s32 map_token_fd; + + /* Hash of the program that has exclusive access to the map. + */ + __aligned_u64 excl_prog_hash; + /* Size of the passed excl_prog_hash. */ + __u32 excl_prog_hash_size; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM and BPF_MAP_FREEZE commands */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3f178a0f8eb1..c8ef91acfe98 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -860,6 +860,7 @@ static void bpf_map_free(struct bpf_map *map) * the free of values or special fields allocated from bpf memory * allocator.
*/ + kfree(map->excl_prog_sha); migrate_disable(); map->ops->map_free(map); migrate_enable(); @@ -1338,9 +1339,9 @@ static bool bpf_net_capable(void) return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); } -#define BPF_MAP_CREATE_LAST_FIELD map_token_fd +#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ -static int map_create(union bpf_attr *attr, bool kernel) +static int map_create(union bpf_attr *attr, bpfptr_t uattr) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1534,7 +1535,29 @@ static int map_create(union bpf_attr *attr, bool kernel) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_create(map, attr, token, kernel); + if (attr->excl_prog_hash) { + bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel); + + if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) { + err = -EINVAL; + goto free_map; + } + + map->excl_prog_sha = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if (!map->excl_prog_sha) { + err = -ENOMEM; + goto free_map; + } + + if (copy_from_bpfptr(map->excl_prog_sha, uprog_hash, SHA256_DIGEST_SIZE)) { + err = -EFAULT; + goto free_map; + } + } else if (attr->excl_prog_hash_size) { + return -EINVAL; + } + + err = security_bpf_map_create(map, attr, token, uattr.is_kernel); if (err) goto free_map_sec; @@ -6008,7 +6031,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) switch (cmd) { case BPF_MAP_CREATE: - err = map_create(&attr, uattr.is_kernel); + err = map_create(&attr, uattr); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6625570ac23d..aef6b266f08d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20407,6 +20407,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(prog); + if (map->excl_prog_sha && + memcmp(map->excl_prog_sha, prog->digest, SHA256_DIGEST_SIZE)) { + verbose(env, "program's hash doesn't match map's excl_prog_hash\n"); + return -EACCES; + } + if (btf_record_has_field(map->record, BPF_LIST_HEAD) || btf_record_has_field(map->record, BPF_RB_ROOT)) { if (is_tracing_prog_type(prog_type)) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 233de8677382..57687b2e1c47 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1522,6 +1522,12 @@ union bpf_attr { * If provided, map_flags should have BPF_F_TOKEN_FD flag set. */ __s32 map_token_fd; + + /* Hash of the program that has exclusive access to the map. + */ + __aligned_u64 excl_prog_hash; + /* Size of the passed excl_prog_hash. */ + __u32 excl_prog_hash_size; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM and BPF_MAP_FREEZE commands */ -- cgit v1.2.3 From ea2e6467ac36bf3d785defc89e58269b15d182f7 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 14 Sep 2025 23:51:35 +0200 Subject: bpf: Return hashes of maps in BPF_OBJ_GET_INFO_BY_FD Currently only array maps are supported, but the implementation can be extended for other maps and objects. The hash is memoized only for exclusive and frozen maps as their content is stable until the exclusive program modifies the map. This is required for BPF signing, enabling a trusted loader program to verify a map's integrity. The loader retrieves the map's runtime hash from the kernel and compares it against an expected hash computed at build time. 
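From userspace, the digest can be requested roughly as follows; a hedged sketch using libbpf's bpf_obj_get_info_by_fd(), with map_fd and error handling assumed:

    struct bpf_map_info info = {};
    __u32 info_len = sizeof(info);
    __u8 sha[32];                   /* SHA256_DIGEST_SIZE */

    info.hash = (__u64)(unsigned long)sha;
    info.hash_size = sizeof(sha);

    if (bpf_obj_get_info_by_fd(map_fd, &info, &info_len) == 0) {
        /* sha[] now holds the map's SHA256 digest */
    }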
Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250914215141.15144-7-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/arraymap.c | 13 ++++++++++++ kernel/bpf/syscall.c | 23 ++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 2 ++ .../testing/selftests/bpf/progs/verifier_map_ptr.c | 7 +++++-- 6 files changed, 48 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c6a6ee1b2938..e0c2c78a5faa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -110,6 +111,7 @@ struct bpf_map_ops { long (*map_pop_elem)(struct bpf_map *map, void *value); long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); + int (*map_get_hash)(struct bpf_map *map, u32 hash_buf_size, void *hash_buf); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, @@ -289,6 +291,7 @@ struct bpf_map_owner { }; struct bpf_map { + u8 sha[SHA256_DIGEST_SIZE]; const struct bpf_map_ops *ops; struct bpf_map *inner_map_meta; #ifdef CONFIG_SECURITY diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 57687b2e1c47..0987b52d5648 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6672,6 +6672,8 @@ struct bpf_map_info { __u32 btf_value_type_id; __u32 btf_vmlinux_id; __u64 map_extra; + __aligned_u64 hash; + __u32 hash_size; } __attribute__((aligned(8))); struct bpf_btf_info { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3d080916faf9..26d5dda989bc 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "map_in_map.h" @@ -174,6 +175,17 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + (u64)array->elem_size * (index & array->index_mask); } +static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size, + void *hash_buf) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + sha256(array->value, (u64)array->elem_size * array->map.max_entries, + hash_buf); + memcpy(array->map.sha, hash_buf, sizeof(array->map.sha)); + return 0; +} + static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) { @@ -800,6 +812,7 @@ const struct bpf_map_ops array_map_ops = { .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, + .map_get_hash = &array_map_get_hash, }; const struct bpf_map_ops percpu_array_map_ops = { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c8ef91acfe98..cf7173b1bb83 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ +#include #include #include #include @@ -5184,6 +5185,9 @@ static int bpf_map_get_info_by_fd(struct file *file, info_len = min_t(u32, sizeof(info), info_len); memset(&info, 0, sizeof(info)); + if (copy_from_user(&info, uinfo, info_len)) + return -EFAULT; + info.type = map->map_type; info.id = map->id; info.key_size = map->key_size; @@ -5208,6 +5212,25 @@ static int bpf_map_get_info_by_fd(struct file *file, return err; } + if (info.hash) { + char __user *uhash = u64_to_user_ptr(info.hash); + + if (!map->ops->map_get_hash) + return -EINVAL; + + 
if (info.hash_size != SHA256_DIGEST_SIZE) + return -EINVAL; + + err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha); + if (err != 0) + return err; + + if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0) + return -EFAULT; + } else if (info.hash_size) { + return -EINVAL; + } + if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 57687b2e1c47..0987b52d5648 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6672,6 +6672,8 @@ struct bpf_map_info { __u32 btf_value_type_id; __u32 btf_vmlinux_id; __u64 map_extra; + __aligned_u64 hash; + __u32 hash_size; } __attribute__((aligned(8))); struct bpf_btf_info { diff --git a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c index 11a079145966..e2767d27d8aa 100644 --- a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c +++ b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c @@ -70,10 +70,13 @@ __naked void bpf_map_ptr_write_rejected(void) : __clobber_all); } +/* The first element of struct bpf_map is a SHA256 hash of 32 bytes, accessing + * into this array is valid. The opts field is now at offset 33. + */ SEC("socket") __description("bpf_map_ptr: read non-existent field rejected") __failure -__msg("cannot access ptr member ops with moff 0 in struct bpf_map with off 1 size 4") +__msg("cannot access ptr member ops with moff 32 in struct bpf_map with off 33 size 4") __failure_unpriv __msg_unpriv("access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN") __flag(BPF_F_ANY_ALIGNMENT) @@ -82,7 +85,7 @@ __naked void read_non_existent_field_rejected(void) asm volatile (" \ r6 = 0; \ r1 = %[map_array_48b] ll; \ - r6 = *(u32*)(r1 + 1); \ + r6 = *(u32*)(r1 + 33); \ r0 = 1; \ exit; \ " : -- cgit v1.2.3 From 8cd189e414bb705312fbfff7f7b5605f6de2459a Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 14 Sep 2025 23:51:36 +0200 Subject: bpf: Move the signature kfuncs to helpers.c No functional changes, except for the addition of the headers for the kfuncs so that they can be used for signature verification. 
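To illustrate the API being moved, a sleepable BPF program might use these kfuncs roughly as follows; a hedged sketch where the two dynptrs are assumed to have been prepared elsewhere:

    struct bpf_key *kr;
    int err;

    kr = bpf_lookup_system_key(VERIFY_USE_SECONDARY_KEYRING);
    if (!kr)
        return -ENOENT;

    /* data_ptr/sig_ptr: struct bpf_dynptr set up by the caller */
    err = bpf_verify_pkcs7_signature(&data_ptr, &sig_ptr, kr);
    bpf_key_put(kr);
    return err;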
Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250914215141.15144-8-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 32 +++++++++ kernel/bpf/helpers.c | 166 ++++++++++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 183 ----------------------------------------------- 3 files changed, 198 insertions(+), 183 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e0c2c78a5faa..dfc1a27b56d5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3424,6 +3424,38 @@ static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, #endif /* CONFIG_BPF_SYSCALL */ #endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ +#if defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) + +struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags); +struct bpf_key *bpf_lookup_system_key(u64 id); +void bpf_key_put(struct bpf_key *bkey); +int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, + struct bpf_key *trusted_keyring); + +#else +static inline struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags) +{ + return NULL; +} + +static inline struct bpf_key *bpf_lookup_system_key(u64 id) +{ + return NULL; +} + +static inline void bpf_key_put(struct bpf_key *bkey) +{ +} + +static inline int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, + struct bpf_key *trusted_keyring) +{ + return -EOPNOTSUPP; +} +#endif /* defined(CONFIG_KEYS) && defined(CONFIG_BPF_SYSCALL) */ + /* verifier prototypes for helper functions called from eBPF programs */ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 51229aba5318..ef4ede8bb74f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "../../lib/kstrtox.h" @@ -3747,6 +3748,163 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign) { return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX); } +#ifdef CONFIG_KEYS +/** + * bpf_lookup_user_key - lookup a key by its serial + * @serial: key handle serial number + * @flags: lookup-specific flags + * + * Search a key with a given *serial* and the provided *flags*. + * If found, increment the reference count of the key by one, and + * return it in the bpf_key structure. + * + * The bpf_key structure must be passed to bpf_key_put() when done + * with it, so that the key reference count is decremented and the + * bpf_key structure is freed. + * + * Permission checks are deferred to the time the key is used by + * one of the available key-specific kfuncs. + * + * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested + * special keyring (e.g. session keyring), if it doesn't yet exist. + * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting + * for the key construction, and to retrieve uninstantiated keys (keys + * without data attached to them). + * + * Return: a bpf_key pointer with a valid key pointer if the key is found, a + * NULL pointer otherwise. + */ +__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags) +{ + key_ref_t key_ref; + struct bpf_key *bkey; + + if (flags & ~KEY_LOOKUP_ALL) + return NULL; + + /* + * Permission check is deferred until the key is used, as the + * intent of the caller is unknown here. 
+ */ + key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); + if (IS_ERR(key_ref)) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); + if (!bkey) { + key_put(key_ref_to_ptr(key_ref)); + return NULL; + } + + bkey->key = key_ref_to_ptr(key_ref); + bkey->has_ref = true; + + return bkey; +} + +/** + * bpf_lookup_system_key - lookup a key by a system-defined ID + * @id: key ID + * + * Obtain a bpf_key structure with a key pointer set to the passed key ID. + * The key pointer is marked as invalid, to prevent bpf_key_put() from + * attempting to decrement the key reference count on that pointer. The key + * pointer set in such way is currently understood only by + * verify_pkcs7_signature(). + * + * Set *id* to one of the values defined in include/linux/verification.h: + * 0 for the primary keyring (immutable keyring of system keys); + * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring + * (where keys can be added only if they are vouched for by existing keys + * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform + * keyring (primarily used by the integrity subsystem to verify a kexec'ed + * kerned image and, possibly, the initramfs signature). + * + * Return: a bpf_key pointer with an invalid key pointer set from the + * pre-determined ID on success, a NULL pointer otherwise + */ +__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id) +{ + struct bpf_key *bkey; + + if (system_keyring_id_check(id) < 0) + return NULL; + + bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); + if (!bkey) + return NULL; + + bkey->key = (struct key *)(unsigned long)id; + bkey->has_ref = false; + + return bkey; +} + +/** + * bpf_key_put - decrement key reference count if key is valid and free bpf_key + * @bkey: bpf_key structure + * + * Decrement the reference count of the key inside *bkey*, if the pointer + * is valid, and free *bkey*. + */ +__bpf_kfunc void bpf_key_put(struct bpf_key *bkey) +{ + if (bkey->has_ref) + key_put(bkey->key); + + kfree(bkey); +} + +/** + * bpf_verify_pkcs7_signature - verify a PKCS#7 signature + * @data_p: data to verify + * @sig_p: signature of the data + * @trusted_keyring: keyring with keys trusted for signature verification + * + * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* + * with keys in a keyring referenced by *trusted_keyring*. + * + * Return: 0 on success, a negative value on error. + */ +__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, + struct bpf_dynptr *sig_p, + struct bpf_key *trusted_keyring) +{ +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION + struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; + struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; + const void *data, *sig; + u32 data_len, sig_len; + int ret; + + if (trusted_keyring->has_ref) { + /* + * Do the permission check deferred in bpf_lookup_user_key(). + * See bpf_lookup_user_key() for more details. + * + * A call to key_task_permission() here would be redundant, as + * it is already done by keyring_search() called by + * find_asymmetric_key(). 
+ */ + ret = key_validate(trusted_keyring->key); + if (ret < 0) + return ret; + } + + data_len = __bpf_dynptr_size(data_ptr); + data = __bpf_dynptr_data(data_ptr, data_len); + sig_len = __bpf_dynptr_size(sig_ptr); + sig = __bpf_dynptr_data(sig_ptr, sig_len); + + return verify_pkcs7_signature(data, data_len, sig, sig_len, + trusted_keyring->key, + VERIFYING_UNSPECIFIED_SIGNATURE, NULL, + NULL); +#else + return -EOPNOTSUPP; +#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ +} +#endif /* CONFIG_KEYS */ __bpf_kfunc_end_defs(); @@ -3788,6 +3946,14 @@ BTF_ID_FLAGS(func, bpf_throw) #ifdef CONFIG_BPF_EVENTS BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS) #endif +#ifdef CONFIG_KEYS +BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) +BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) +#ifdef CONFIG_SYSTEM_DATA_VERIFICATION +BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) +#endif +#endif BTF_KFUNCS_END(generic_btf_ids) static const struct btf_kfunc_id_set generic_kfunc_set = { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 606007c387c5..f2360579658e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -1241,188 +1240,6 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -#ifdef CONFIG_KEYS -__bpf_kfunc_start_defs(); - -/** - * bpf_lookup_user_key - lookup a key by its serial - * @serial: key handle serial number - * @flags: lookup-specific flags - * - * Search a key with a given *serial* and the provided *flags*. - * If found, increment the reference count of the key by one, and - * return it in the bpf_key structure. - * - * The bpf_key structure must be passed to bpf_key_put() when done - * with it, so that the key reference count is decremented and the - * bpf_key structure is freed. - * - * Permission checks are deferred to the time the key is used by - * one of the available key-specific kfuncs. - * - * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested - * special keyring (e.g. session keyring), if it doesn't yet exist. - * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting - * for the key construction, and to retrieve uninstantiated keys (keys - * without data attached to them). - * - * Return: a bpf_key pointer with a valid key pointer if the key is found, a - * NULL pointer otherwise. - */ -__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags) -{ - key_ref_t key_ref; - struct bpf_key *bkey; - - if (flags & ~KEY_LOOKUP_ALL) - return NULL; - - /* - * Permission check is deferred until the key is used, as the - * intent of the caller is unknown here. - */ - key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK); - if (IS_ERR(key_ref)) - return NULL; - - bkey = kmalloc(sizeof(*bkey), GFP_KERNEL); - if (!bkey) { - key_put(key_ref_to_ptr(key_ref)); - return NULL; - } - - bkey->key = key_ref_to_ptr(key_ref); - bkey->has_ref = true; - - return bkey; -} - -/** - * bpf_lookup_system_key - lookup a key by a system-defined ID - * @id: key ID - * - * Obtain a bpf_key structure with a key pointer set to the passed key ID. - * The key pointer is marked as invalid, to prevent bpf_key_put() from - * attempting to decrement the key reference count on that pointer. The key - * pointer set in such way is currently understood only by - * verify_pkcs7_signature(). 
- * - * Set *id* to one of the values defined in include/linux/verification.h: - * 0 for the primary keyring (immutable keyring of system keys); - * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring - * (where keys can be added only if they are vouched for by existing keys - * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform - * keyring (primarily used by the integrity subsystem to verify a kexec'ed - * kerned image and, possibly, the initramfs signature). - * - * Return: a bpf_key pointer with an invalid key pointer set from the - * pre-determined ID on success, a NULL pointer otherwise - */ -__bpf_kfunc struct bpf_key *bpf_lookup_system_key(u64 id) -{ - struct bpf_key *bkey; - - if (system_keyring_id_check(id) < 0) - return NULL; - - bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC); - if (!bkey) - return NULL; - - bkey->key = (struct key *)(unsigned long)id; - bkey->has_ref = false; - - return bkey; -} - -/** - * bpf_key_put - decrement key reference count if key is valid and free bpf_key - * @bkey: bpf_key structure - * - * Decrement the reference count of the key inside *bkey*, if the pointer - * is valid, and free *bkey*. - */ -__bpf_kfunc void bpf_key_put(struct bpf_key *bkey) -{ - if (bkey->has_ref) - key_put(bkey->key); - - kfree(bkey); -} - -#ifdef CONFIG_SYSTEM_DATA_VERIFICATION -/** - * bpf_verify_pkcs7_signature - verify a PKCS#7 signature - * @data_p: data to verify - * @sig_p: signature of the data - * @trusted_keyring: keyring with keys trusted for signature verification - * - * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr* - * with keys in a keyring referenced by *trusted_keyring*. - * - * Return: 0 on success, a negative value on error. - */ -__bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, - struct bpf_dynptr *sig_p, - struct bpf_key *trusted_keyring) -{ - struct bpf_dynptr_kern *data_ptr = (struct bpf_dynptr_kern *)data_p; - struct bpf_dynptr_kern *sig_ptr = (struct bpf_dynptr_kern *)sig_p; - const void *data, *sig; - u32 data_len, sig_len; - int ret; - - if (trusted_keyring->has_ref) { - /* - * Do the permission check deferred in bpf_lookup_user_key(). - * See bpf_lookup_user_key() for more details. - * - * A call to key_task_permission() here would be redundant, as - * it is already done by keyring_search() called by - * find_asymmetric_key(). 
- */ - ret = key_validate(trusted_keyring->key); - if (ret < 0) - return ret; - } - - data_len = __bpf_dynptr_size(data_ptr); - data = __bpf_dynptr_data(data_ptr, data_len); - sig_len = __bpf_dynptr_size(sig_ptr); - sig = __bpf_dynptr_data(sig_ptr, sig_len); - - return verify_pkcs7_signature(data, data_len, sig, sig_len, - trusted_keyring->key, - VERIFYING_UNSPECIFIED_SIGNATURE, NULL, - NULL); -} -#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */ - -__bpf_kfunc_end_defs(); - -BTF_KFUNCS_START(key_sig_kfunc_set) -BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE) -BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE) -#ifdef CONFIG_SYSTEM_DATA_VERIFICATION -BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE) -#endif -BTF_KFUNCS_END(key_sig_kfunc_set) - -static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = { - .owner = THIS_MODULE, - .set = &key_sig_kfunc_set, -}; - -static int __init bpf_key_sig_kfuncs_init(void) -{ - return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, - &bpf_key_sig_kfunc_set); -} - -late_initcall(bpf_key_sig_kfuncs_init); -#endif /* CONFIG_KEYS */ - static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { -- cgit v1.2.3 From b9c3d426c8a5823b3a1e5078719750c6abb0d2c1 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 8 Sep 2025 14:12:59 +0300 Subject: wifi: cfg80211: Advertise supported NAN capabilities Allow drivers to specify the supported NAN capabilities and support advertising the NAN capabilities to user space. Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.2976966556f5.Ic6e43b10049573180c909dad806f279cfb31143e@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 17 +++++++++++++++++ include/net/cfg80211.h | 38 ++++++++++++++++++++++++++++++++++++++ net/wireless/nl80211.c | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index d350263f23f3..2110345de8ef 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -6065,4 +6065,21 @@ static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) _data + ieee80211_mle_common_size(_data),\ _len - ieee80211_mle_common_size(_data)) +/* NAN operation mode, as defined in Wi-Fi Aware (TM) specification Table 81 */ +#define NAN_OP_MODE_PHY_MODE_VHT 0x01 +#define NAN_OP_MODE_PHY_MODE_HE 0x10 +#define NAN_OP_MODE_PHY_MODE_MASK 0x11 +#define NAN_OP_MODE_80P80MHZ 0x02 +#define NAN_OP_MODE_160MHZ 0x04 +#define NAN_OP_MODE_PNDL_SUPPRTED 0x08 + +/* NAN Device capabilities, as defined in Wi-Fi Aware (TM) specification + * Table 79 + */ +#define NAN_DEV_CAPA_DFS_OWNER 0x01 +#define NAN_DEV_CAPA_EXT_KEY_ID_SUPPORTED 0x02 +#define NAN_DEV_CAPA_SIM_NDP_RX_SUPPORTED 0x04 +#define NAN_DEV_CAPA_NDPE_SUPPORTED 0x08 +#define NAN_DEV_CAPA_S3_SUPPORTED 0x10 + #endif /* LINUX_IEEE80211_H */ diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1b10bd31bdd6..e30c1886c530 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5711,6 +5711,42 @@ struct wiphy_radio { u32 antenna_mask; }; +/** + * enum wiphy_nan_flags - NAN capabilities + * + * @WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC: Device supports NAN configurable + * synchronization. + * @WIPHY_NAN_FLAGS_USERSPACE_DE: Device doesn't support DE offload. 
+ */ +enum wiphy_nan_flags { + WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC = BIT(0), + WIPHY_NAN_FLAGS_USERSPACE_DE = BIT(1), +}; + +/** + * struct wiphy_nan_capa - NAN capabilities + * + * This structure describes the NAN capabilities of a wiphy. + * + * @flags: NAN capabilities flags, see &enum wiphy_nan_flags + * @op_mode: NAN operation mode, as defined in Wi-Fi Aware (TM) specification + * Table 81. + * @n_antennas: number of antennas supported by the device for Tx/Rx. Lower + * nibble indicates the number of TX antennas and upper nibble indicates the + * number of RX antennas. Value 0 indicates the information is not + * available. + * @max_channel_switch_time: maximum channel switch time in milliseconds. + * @dev_capabilities: NAN device capabilities as defined in Wi-Fi Aware (TM) + * specification Table 79 (Capabilities field). + */ +struct wiphy_nan_capa { + u32 flags; + u8 op_mode; + u8 n_antennas; + u16 max_channel_switch_time; + u8 dev_capabilities; +}; + #define CFG80211_HW_TIMESTAMP_ALL_PEERS 0xffff /** @@ -5884,6 +5920,7 @@ struct wiphy_radio { * bitmap of &enum nl80211_band values. For instance, for * NL80211_BAND_2GHZ, bit 0 would be set * (i.e. BIT(NL80211_BAND_2GHZ)). + * @nan_capa: NAN capabilities * * @txq_limit: configuration of internal TX queue frame limit * @txq_memory_limit: configuration internal TX queue memory limit @@ -6065,6 +6102,7 @@ struct wiphy { u32 bss_select_support; u8 nan_supported_bands; + struct wiphy_nan_capa nan_capa; u32 txq_limit; u32 txq_memory_limit; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 904a725a4f4a..bcd18ae59e84 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2605,6 +2605,41 @@ fail: return -ENOBUFS; } +static int nl80211_put_nan_capa(struct wiphy *wiphy, struct sk_buff *msg) +{ + struct nlattr *nan_caps; + + nan_caps = nla_nest_start(msg, NL80211_ATTR_NAN_CAPABILITIES); + if (!nan_caps) + return -ENOBUFS; + + if (wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC && + nla_put_flag(msg, NL80211_NAN_CAPA_CONFIGURABLE_SYNC)) + goto fail; + + if ((wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_USERSPACE_DE) && + nla_put_flag(msg, NL80211_NAN_CAPA_USERSPACE_DE)) + goto fail; + + if (nla_put_u8(msg, NL80211_NAN_CAPA_OP_MODE, + wiphy->nan_capa.op_mode) || + nla_put_u8(msg, NL80211_NAN_CAPA_NUM_ANTENNAS, + wiphy->nan_capa.n_antennas) || + nla_put_u16(msg, NL80211_NAN_CAPA_MAX_CHANNEL_SWITCH_TIME, + wiphy->nan_capa.max_channel_switch_time) || + nla_put_u8(msg, NL80211_NAN_CAPA_CAPABILITIES, + wiphy->nan_capa.dev_capabilities)) + goto fail; + + nla_nest_end(msg, nan_caps); + + return 0; + +fail: + nla_nest_cancel(msg, nan_caps); + return -ENOBUFS; +} + struct nl80211_dump_wiphy_state { s64 filter_wiphy; long start; @@ -3257,6 +3292,12 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (nl80211_put_radios(&rdev->wiphy, msg)) goto nla_put_failure; + state->split_start++; + break; + case 18: + if (nl80211_put_nan_capa(&rdev->wiphy, msg)) + goto nla_put_failure; + /* done */ state->split_start = 0; break; -- cgit v1.2.3 From 31e7681da78d7e8d2d83185c0e640012a018f229 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Thu, 18 Sep 2025 15:19:12 +1000 Subject: wifi: mac80211: correctly initialise S1G chandef for STA When moving to the AP's channel, ensure we correctly initialise the chandef and perform the required validation.
Additionally, if the AP is beaconing on a 2MHz primary, derive the 2MHz primary center frequency by extracting the sibling 1MHz primary and averaging the two frequencies. Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250918051913.500781-3-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 18 +++++++++++++++- net/mac80211/ieee80211_i.h | 3 ++- net/mac80211/main.c | 6 ++++-- net/mac80211/mlme.c | 53 ++++++++++++++++++++++++++++++++++++++++------ net/mac80211/scan.c | 13 ++++++------ net/mac80211/util.c | 39 ++++++++++++++++++++++++++++------ 6 files changed, 109 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 2110345de8ef..ddff9102f633 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1182,6 +1182,18 @@ enum ieee80211_s1g_chanwidth { IEEE80211_S1G_CHANWIDTH_16MHZ = 15, }; +/** + * enum ieee80211_s1g_pri_chanwidth - S1G primary channel widths + * described in IEEE80211-2024 Table 10-39. + * + * @IEEE80211_S1G_PRI_CHANWIDTH_2MHZ: 2MHz primary channel + * @IEEE80211_S1G_PRI_CHANWIDTH_1MHZ: 1MHz primary channel + */ +enum ieee80211_s1g_pri_chanwidth { + IEEE80211_S1G_PRI_CHANWIDTH_2MHZ = 0, + IEEE80211_S1G_PRI_CHANWIDTH_1MHZ = 1, +}; + #define WLAN_SA_QUERY_TR_ID_LEN 2 #define WLAN_MEMBERSHIP_LEN 8 #define WLAN_USER_POSITION_LEN 16 @@ -3170,8 +3182,12 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) #define S1G_CAP9_LINK_ADAPT_PER_CONTROL_RESPONSE BIT(0) -#define S1G_OPER_CH_WIDTH_PRIMARY_1MHZ BIT(0) +#define S1G_OPER_CH_WIDTH_PRIMARY BIT(0) #define S1G_OPER_CH_WIDTH_OPER GENMASK(4, 1) +#define S1G_OPER_CH_PRIMARY_LOCATION BIT(5) + +#define S1G_2M_PRIMARY_LOCATION_LOWER 0 +#define S1G_2M_PRIMARY_LOCATION_UPPER 1 /* EHT MAC capabilities as defined in P802.11be_D2.0 section 9.4.2.313.2 */ #define IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS 0x01 diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 414058bced1a..73fd86ec1bce 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2710,7 +2710,8 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, const struct ieee80211_he_operation *he_oper, const struct ieee80211_eht_operation *eht_oper, struct cfg80211_chan_def *chandef); -bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, +bool ieee80211_chandef_s1g_oper(struct ieee80211_local *local, + const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef); void ieee80211_chandef_downgrade(struct cfg80211_chan_def *chandef, struct ieee80211_conn_settings *conn); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 27b3ec5deabe..eefa6f7e899b 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1249,11 +1249,13 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (!dflt_chandef.chan) { /* * Assign the first enabled channel to dflt_chandef - * from the list of channels + * from the list of channels. For S1G interfaces + * ensure it can be used as a primary.
*/ for (i = 0; i < sband->n_channels; i++) if (!(sband->channels[i].flags & - IEEE80211_CHAN_DISABLED)) + (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_S1G_NO_PRIMARY))) break; /* if none found then use the first anyway */ if (i == sband->n_channels) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0e12309accbe..3b5827ea438e 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -180,10 +180,11 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, /* get special S1G case out of the way */ if (sband->band == NL80211_BAND_S1GHZ) { - if (!ieee80211_chandef_s1g_oper(elems->s1g_oper, chandef)) { - sdata_info(sdata, - "Missing S1G Operation Element? Trying operating == primary\n"); - chandef->width = ieee80211_s1g_channel_width(channel); + if (!ieee80211_chandef_s1g_oper(sdata->local, elems->s1g_oper, + chandef)) { + /* Fallback to default 1MHz */ + chandef->width = NL80211_CHAN_WIDTH_1; + chandef->s1g_primary_2mhz = false; } return IEEE80211_CONN_MODE_S1G; @@ -1046,6 +1047,14 @@ again: ret = -EINVAL; goto free; } + + chanreq->oper = *ap_chandef; + if (!cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper, + IEEE80211_CHAN_DISABLED)) { + ret = -EINVAL; + goto free; + } + return elems; case NL80211_BAND_6GHZ: if (ap_mode < IEEE80211_CONN_MODE_HE) { @@ -7292,6 +7301,38 @@ static bool ieee80211_mgd_ssid_mismatch(struct ieee80211_sub_if_data *sdata, return memcmp(elems->ssid, cfg->ssid, cfg->ssid_len); } +static bool +ieee80211_rx_beacon_freq_valid(struct ieee80211_local *local, + struct ieee80211_mgmt *mgmt, + struct ieee80211_rx_status *rx_status, + struct ieee80211_chanctx_conf *chanctx) +{ + u32 pri_2mhz_khz; + struct ieee80211_channel *s1g_sibling_1mhz; + u32 pri_khz = ieee80211_channel_to_khz(chanctx->def.chan); + u32 rx_khz = ieee80211_rx_status_to_khz(rx_status); + + if (rx_khz == pri_khz) + return true; + + if (!chanctx->def.s1g_primary_2mhz) + return false; + + /* + * If we have an S1G interface with a 2MHz primary, beacons are + * sent on the center frequency of the 2MHz primary. Find the sibling + * 1MHz channel and calculate the 2MHz primary center frequency. + */ + s1g_sibling_1mhz = cfg80211_s1g_get_primary_sibling(local->hw.wiphy, + &chanctx->def); + if (!s1g_sibling_1mhz) + return false; + + pri_2mhz_khz = + (pri_khz + ieee80211_channel_to_khz(s1g_sibling_1mhz)) / 2; + return rx_khz == pri_2mhz_khz; +} + static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, struct ieee80211_hdr *hdr, size_t len, struct ieee80211_rx_status *rx_status) @@ -7346,8 +7387,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, return; } - if (ieee80211_rx_status_to_khz(rx_status) != - ieee80211_channel_to_khz(chanctx_conf->def.chan)) { + if (!ieee80211_rx_beacon_freq_valid(local, mgmt, rx_status, + chanctx_conf)) { rcu_read_unlock(); return; } diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index dbf98aa4cd67..bb9563f50e7b 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -996,15 +996,15 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, local->scan_chandef.freq1_offset = chan->freq_offset; local->scan_chandef.center_freq2 = 0; - /* For scanning on the S1G band, detect the channel width according to - * the channel being scanned. - */ + /* For S1G, only scan the 1MHz primaries. 
*/
	if (chan->band == NL80211_BAND_S1GHZ) {
-		local->scan_chandef.width = ieee80211_s1g_channel_width(chan);
+		local->scan_chandef.width = NL80211_CHAN_WIDTH_1;
+		local->scan_chandef.s1g_primary_2mhz = false;
 		goto set_channel;
 	}
 
-	/* If scanning on oper channel, use whatever channel-type
+	/*
+	 * If scanning on oper channel, use whatever channel-type
 	 * is currently in use.
 	 */
 	if (chan == local->hw.conf.chandef.chan)
@@ -1213,7 +1213,8 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata,
 
 	for (band = 0; band < NUM_NL80211_BANDS; band++) {
 		if (!local->hw.wiphy->bands[band] ||
-		    band == NL80211_BAND_6GHZ)
+		    band == NL80211_BAND_6GHZ ||
+		    band == NL80211_BAND_S1GHZ)
 			continue;
 
 		max_n = local->hw.wiphy->bands[band]->n_channels;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 123842b841f2..c9931537f9d2 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -3199,10 +3199,11 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local,
 	return true;
 }
 
-bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper,
+bool ieee80211_chandef_s1g_oper(struct ieee80211_local *local,
+				const struct ieee80211_s1g_oper_ie *oper,
 				struct cfg80211_chan_def *chandef)
 {
-	u32 oper_freq;
+	u32 oper_khz, pri_1mhz_khz, pri_2mhz_khz;
 
 	if (!oper)
 		return false;
@@ -3227,12 +3228,36 @@ bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper,
 		return false;
 	}
 
-	oper_freq = ieee80211_channel_to_freq_khz(oper->oper_ch,
-						  NL80211_BAND_S1GHZ);
-	chandef->center_freq1 = KHZ_TO_MHZ(oper_freq);
-	chandef->freq1_offset = oper_freq % 1000;
+	chandef->s1g_primary_2mhz = false;
 
-	return true;
+	switch (u8_get_bits(oper->ch_width, S1G_OPER_CH_WIDTH_PRIMARY)) {
+	case IEEE80211_S1G_PRI_CHANWIDTH_1MHZ:
+		pri_1mhz_khz = ieee80211_channel_to_freq_khz(
+			oper->primary_ch, NL80211_BAND_S1GHZ);
+		break;
+	case IEEE80211_S1G_PRI_CHANWIDTH_2MHZ:
+		chandef->s1g_primary_2mhz = true;
+		pri_2mhz_khz = ieee80211_channel_to_freq_khz(
+			oper->primary_ch, NL80211_BAND_S1GHZ);
+
+		if (u8_get_bits(oper->ch_width, S1G_OPER_CH_PRIMARY_LOCATION) ==
+		    S1G_2M_PRIMARY_LOCATION_LOWER)
+			pri_1mhz_khz = pri_2mhz_khz - 500;
+		else
+			pri_1mhz_khz = pri_2mhz_khz + 500;
+		break;
+	default:
+		return false;
+	}
+
+	oper_khz = ieee80211_channel_to_freq_khz(oper->oper_ch,
+						 NL80211_BAND_S1GHZ);
+	chandef->center_freq1 = KHZ_TO_MHZ(oper_khz);
+	chandef->freq1_offset = oper_khz % 1000;
+	chandef->chan =
+		ieee80211_get_channel_khz(local->hw.wiphy, pri_1mhz_khz);
+
+	return chandef->chan;
 }
 
 int ieee80211_put_srates_elem(struct sk_buff *skb,
-- cgit v1.2.3


From e1b849cfa6b61f1c866a908c9e8dd9b5aaab820b Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Wed, 9 Apr 2025 17:12:59 +0200
Subject: writeback: Avoid contention on wb->list_lock when switching inodes

There can be multiple inode switch works that are trying to switch
inodes to / from the same wb. This can happen in particular if some
cgroup exits which owns many (thousands) inodes and we need to switch
them all. In this case several inode_switch_wbs_work_fn() instances
will just be spinning on the same wb->list_lock while only one of them
makes forward progress. This wastes CPU cycles and quickly leads to
softlockup reports and an unusable system.

Instead of running several inode_switch_wbs_work_fn() instances in
parallel switching to the same wb and contending on wb->list_lock, run
just one work item per wb and manage a queue of isw items switching to
this wb.
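The handoff itself is the standard lockless-list pattern: producers
publish a context with llist_add(), only the producer that makes the
list non-empty schedules the work item, and the single worker drains
whole batches with llist_del_all(). A simplified sketch of the pattern,
reusing the patch's types (process_ctx() is a hypothetical stand-in for
the real per-context switching):

	static void switch_work_fn(struct work_struct *work)
	{
		struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
							switch_work);
		struct llist_node *list;

		/* Keep draining until the queue stays empty. */
		while ((list = llist_del_all(&wb->switch_wbs_ctxs))) {
			struct inode_switch_wbs_context *isw, *next;

			llist_for_each_entry_safe(isw, next, list, list)
				process_ctx(wb, isw);	/* hypothetical consumer */
		}
	}

	static void queue_isw(struct bdi_writeback *wb,
			      struct inode_switch_wbs_context *isw)
	{
		/* llist_add() returns true only on the empty->non-empty
		 * transition, so the work item is queued once per batch. */
		if (llist_add(&isw->list, &wb->switch_wbs_ctxs))
			queue_work(system_wq, &wb->switch_work);	/* isw_wq upstream */
	}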
Acked-by: Tejun Heo Signed-off-by: Jan Kara --- fs/fs-writeback.c | 99 +++++++++++++++++++++++++--------------- include/linux/backing-dev-defs.h | 4 ++ include/linux/writeback.h | 2 + mm/backing-dev.c | 5 ++ 4 files changed, 74 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index cc57367fb641..b0e9092ccf04 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -368,7 +368,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) } struct inode_switch_wbs_context { - struct rcu_work work; + /* List of queued switching contexts for the wb */ + struct llist_node list; /* * Multiple inodes can be switched at once. The switching procedure @@ -378,7 +379,6 @@ struct inode_switch_wbs_context { * array embedded into struct inode_switch_wbs_context. Otherwise * an inode could be left in a non-consistent state. */ - struct bdi_writeback *new_wb; struct inode *inodes[]; }; @@ -486,13 +486,11 @@ skip_switch: return switched; } -static void inode_switch_wbs_work_fn(struct work_struct *work) +static void process_inode_switch_wbs(struct bdi_writeback *new_wb, + struct inode_switch_wbs_context *isw) { - struct inode_switch_wbs_context *isw = - container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; - struct bdi_writeback *new_wb = isw->new_wb; unsigned long nr_switched = 0; struct inode **inodep; @@ -543,6 +541,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) atomic_dec(&isw_nr_in_flight); } +void inode_switch_wbs_work_fn(struct work_struct *work) +{ + struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback, + switch_work); + struct inode_switch_wbs_context *isw, *next_isw; + struct llist_node *list; + + /* + * Grab out reference to wb so that it cannot get freed under us + * after we process all the isw items. + */ + wb_get(new_wb); + while (1) { + list = llist_del_all(&new_wb->switch_wbs_ctxs); + /* Nothing to do? */ + if (!list) + break; + /* + * In addition to synchronizing among switchers, I_WB_SWITCH + * tells the RCU protected stat update paths to grab the i_page + * lock so that stat transfer can synchronize against them. + * Let's continue after I_WB_SWITCH is guaranteed to be + * visible. 
+ */ + synchronize_rcu(); + + llist_for_each_entry_safe(isw, next_isw, list, list) + process_inode_switch_wbs(new_wb, isw); + } + wb_put(new_wb); +} + static bool inode_prepare_wbs_switch(struct inode *inode, struct bdi_writeback *new_wb) { @@ -572,6 +602,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode, return true; } +static void wb_queue_isw(struct bdi_writeback *wb, + struct inode_switch_wbs_context *isw) +{ + if (llist_add(&isw->list, &wb->switch_wbs_ctxs)) + queue_work(isw_wq, &wb->switch_work); +} + /** * inode_switch_wbs - change the wb association of an inode * @inode: target inode @@ -585,6 +622,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) struct backing_dev_info *bdi = inode_to_bdi(inode); struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; + struct bdi_writeback *new_wb = NULL; /* noop if seems to be already in progress */ if (inode->i_state & I_WB_SWITCH) @@ -609,40 +647,34 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (!memcg_css) goto out_free; - isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); css_put(memcg_css); - if (!isw->new_wb) + if (!new_wb) goto out_free; - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + if (!inode_prepare_wbs_switch(inode, new_wb)) goto out_free; isw->inodes[0] = inode; - /* - * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the i_page - * lock so that stat transfer can synchronize against them. - * Let's continue after I_WB_SWITCH is guaranteed to be visible. - */ - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_rcu_work(isw_wq, &isw->work); + wb_queue_isw(new_wb, isw); return; out_free: atomic_dec(&isw_nr_in_flight); - if (isw->new_wb) - wb_put(isw->new_wb); + if (new_wb) + wb_put(new_wb); kfree(isw); } -static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw, +static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb, + struct inode_switch_wbs_context *isw, struct list_head *list, int *nr) { struct inode *inode; list_for_each_entry(inode, list, i_io_list) { - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + if (!inode_prepare_wbs_switch(inode, new_wb)) continue; isw->inodes[*nr] = inode; @@ -666,6 +698,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) { struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; + struct bdi_writeback *new_wb; int nr; bool restart = false; @@ -678,12 +711,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) for (memcg_css = wb->memcg_css->parent; memcg_css; memcg_css = memcg_css->parent) { - isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); - if (isw->new_wb) + new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); + if (new_wb) break; } - if (unlikely(!isw->new_wb)) - isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ + if (unlikely(!new_wb)) + new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ nr = 0; spin_lock(&wb->list_lock); @@ -695,27 +728,21 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) * bandwidth restrictions, as writeback of inode metadata is not * accounted for. 
*/ - restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr); + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr); if (!restart) - restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr); + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time, + &nr); spin_unlock(&wb->list_lock); /* no attached inodes? bail out */ if (nr == 0) { atomic_dec(&isw_nr_in_flight); - wb_put(isw->new_wb); + wb_put(new_wb); kfree(isw); return restart; } - /* - * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the i_page - * lock so that stat transfer can synchronize against them. - * Let's continue after I_WB_SWITCH is guaranteed to be visible. - */ - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_rcu_work(isw_wq, &isw->work); + wb_queue_isw(new_wb, isw); return restart; } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 2ad261082bba..c5c9d89c73ed 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -152,6 +152,10 @@ struct bdi_writeback { struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ struct list_head b_attached; /* attached inodes, protected by list_lock */ struct list_head offline_node; /* anchored at offline_cgwbs */ + struct work_struct switch_work; /* work used to perform inode switching + * to this wb */ + struct llist_head switch_wbs_ctxs; /* queued contexts for + * writeback switching */ union { struct work_struct release_work; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a2848d731a46..15a4bc4ab819 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -265,6 +265,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); } +void inode_switch_wbs_work_fn(struct work_struct *work); + #else /* CONFIG_CGROUP_WRITEBACK */ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 783904d8c5ef..0beaca6bacf7 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -633,6 +633,7 @@ static void cgwb_release_workfn(struct work_struct *work) wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); + WARN_ON_ONCE(work_pending(&wb->switch_work)); call_rcu(&wb->rcu, cgwb_free_rcu); } @@ -709,6 +710,8 @@ static int cgwb_create(struct backing_dev_info *bdi, wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; INIT_LIST_HEAD(&wb->b_attached); + INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn); + init_llist_head(&wb->switch_wbs_ctxs); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); bdi_get(bdi); @@ -839,6 +842,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; + INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn); + init_llist_head(&bdi->wb.switch_wbs_ctxs); } return ret; } -- cgit v1.2.3 From e3e1812f8e25ac277f5cc9249802365300c582e3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:28 +0200 Subject: ns: move to_ns_common() to ns_common.h Move the helper to ns_common.h where it belongs. 
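Because the macro dispatches on the pointer type via _Generic, callers
hand in the concrete namespace type and get the embedded ns_common back.
A purely illustrative (hypothetical) caller:

	static void example(struct pid_namespace *pid_ns, struct net *net)
	{
		struct ns_common *a = to_ns_common(pid_ns);	/* &pid_ns->ns */
		struct ns_common *b = to_ns_common(net);	/* &net->ns */

		pr_debug("ns inums: %u %u\n", a->inum, b->inum);
	}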
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 20 ++++++++++++++++++++ include/linux/nsproxy.h | 11 ----------- 2 files changed, 20 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 7d22ea50b098..bc2e0758e1c9 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -6,6 +6,15 @@ struct proc_ns_operations; +struct cgroup_namespace; +struct ipc_namespace; +struct mnt_namespace; +struct net; +struct pid_namespace; +struct time_namespace; +struct user_namespace; +struct uts_namespace; + struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; @@ -13,4 +22,15 @@ struct ns_common { refcount_t count; }; +#define to_ns_common(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(__ns)->ns, \ + struct ipc_namespace *: &(__ns)->ns, \ + struct mnt_namespace *: &(__ns)->ns, \ + struct net *: &(__ns)->ns, \ + struct pid_namespace *: &(__ns)->ns, \ + struct time_namespace *: &(__ns)->ns, \ + struct user_namespace *: &(__ns)->ns, \ + struct uts_namespace *: &(__ns)->ns) + #endif diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index dab6a1734a22..e6bec522b139 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -42,17 +42,6 @@ struct nsproxy { }; extern struct nsproxy init_nsproxy; -#define to_ns_common(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &(__ns->ns), \ - struct ipc_namespace *: &(__ns->ns), \ - struct net *: &(__ns->ns), \ - struct pid_namespace *: &(__ns->ns), \ - struct mnt_namespace *: &(__ns->ns), \ - struct time_namespace *: &(__ns->ns), \ - struct user_namespace *: &(__ns->ns), \ - struct uts_namespace *: &(__ns->ns)) - /* * A structure to encompass all bits needed to install * a partial or complete new set of namespaces. -- cgit v1.2.3 From 9296f46a9645cf753d2522093485cebe77635aa6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:29 +0200 Subject: nsfs: add nsfs.h header And move the stuff out from proc_ns.h where it really doesn't belong. 
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/nsfs.h | 26 ++++++++++++++++++++++++++ include/linux/proc_ns.h | 13 +------------ 2 files changed, 27 insertions(+), 12 deletions(-) create mode 100644 include/linux/nsfs.h (limited to 'include/linux') diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h new file mode 100644 index 000000000000..fb84aa538091 --- /dev/null +++ b/include/linux/nsfs.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Christian Brauner */ + +#ifndef _LINUX_NSFS_H +#define _LINUX_NSFS_H + +#include + +struct path; +struct task_struct; +struct proc_ns_operations; + +int ns_get_path(struct path *path, struct task_struct *task, + const struct proc_ns_operations *ns_ops); +typedef struct ns_common *ns_get_path_helper_t(void *); +int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, + void *private_data); + +bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino); + +int ns_get_name(char *buf, size_t size, struct task_struct *task, + const struct proc_ns_operations *ns_ops); +void nsfs_init(void); + +#endif /* _LINUX_NSFS_H */ + diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 4b20375f3783..5e1a4b378b79 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -5,7 +5,7 @@ #ifndef _LINUX_PROC_NS_H #define _LINUX_PROC_NS_H -#include +#include #include struct pid_namespace; @@ -75,16 +75,5 @@ static inline int ns_alloc_inum(struct ns_common *ns) #define ns_free_inum(ns) proc_free_inum((ns)->inum) #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) -extern int ns_get_path(struct path *path, struct task_struct *task, - const struct proc_ns_operations *ns_ops); -typedef struct ns_common *ns_get_path_helper_t(void *); -extern int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, - void *private_data); - -extern bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino); - -extern int ns_get_name(char *buf, size_t size, struct task_struct *task, - const struct proc_ns_operations *ns_ops); -extern void nsfs_init(void); #endif /* _LINUX_PROC_NS_H */ -- cgit v1.2.3 From 660def10b01b248fd97255afacb7b0e305ac833a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:30 +0200 Subject: ns: uniformly initialize ns_common No point in cargo-culting the same code across all the different types. Use one common initializer. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/proc_ns.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 5e1a4b378b79..dbb119bda097 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -72,6 +72,22 @@ static inline int ns_alloc_inum(struct ns_common *ns) return proc_alloc_inum(&ns->inum); } +static inline int ns_common_init(struct ns_common *ns, + const struct proc_ns_operations *ops, + bool alloc_inum) +{ + if (alloc_inum) { + int ret; + ret = proc_alloc_inum(&ns->inum); + if (ret) + return ret; + } + refcount_set(&ns->count, 1); + ns->stashed = NULL; + ns->ops = ops; + return 0; +} + #define ns_free_inum(ns) proc_free_inum((ns)->inum) #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) -- cgit v1.2.3 From 86c5aba210b145d7de011a5abaf9b785aa70a183 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:39 +0200 Subject: ns: remove ns_alloc_inum() It's now unused. 
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/proc_ns.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index dbb119bda097..e50d312f9fee 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -66,12 +66,6 @@ static inline void proc_free_inum(unsigned int inum) {} #endif /* CONFIG_PROC_FS */ -static inline int ns_alloc_inum(struct ns_common *ns) -{ - WRITE_ONCE(ns->stashed, NULL); - return proc_alloc_inum(&ns->inum); -} - static inline int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, bool alloc_inum) -- cgit v1.2.3 From 885fc8ac0a4dc70f5d87b80b0977292870e35c60 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:40 +0200 Subject: nstree: make iterator generic Move the namespace iteration infrastructure originally introduced for mount namespaces into a generic library usable by all namespace types. Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 9 ++ include/linux/nstree.h | 91 ++++++++++++++++++ include/linux/proc_ns.h | 3 + kernel/Makefile | 2 +- kernel/nstree.c | 233 ++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 337 insertions(+), 1 deletion(-) create mode 100644 include/linux/nstree.h create mode 100644 kernel/nstree.c (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index bc2e0758e1c9..7224072cccc5 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -3,6 +3,7 @@ #define _LINUX_NS_COMMON_H #include +#include struct proc_ns_operations; @@ -20,6 +21,14 @@ struct ns_common { const struct proc_ns_operations *ops; unsigned int inum; refcount_t count; + union { + struct { + u64 ns_id; + struct rb_node ns_tree_node; + struct list_head ns_list_node; + }; + struct rcu_head ns_rcu; + }; }; #define to_ns_common(__ns) \ diff --git a/include/linux/nstree.h b/include/linux/nstree.h new file mode 100644 index 000000000000..29ad6402260c --- /dev/null +++ b/include/linux/nstree.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NSTREE_H +#define _LINUX_NSTREE_H + +#include +#include +#include +#include +#include +#include + +/** + * struct ns_tree - Namespace tree + * @ns_tree: Rbtree of namespaces of a particular type + * @ns_list: Sequentially walkable list of all namespaces of this type + * @ns_tree_lock: Seqlock to protect the tree and list + */ +struct ns_tree { + struct rb_root ns_tree; + struct list_head ns_list; + seqlock_t ns_tree_lock; + int type; +}; + +extern struct ns_tree cgroup_ns_tree; +extern struct ns_tree ipc_ns_tree; +extern struct ns_tree mnt_ns_tree; +extern struct ns_tree net_ns_tree; +extern struct ns_tree pid_ns_tree; +extern struct ns_tree time_ns_tree; +extern struct ns_tree user_ns_tree; +extern struct ns_tree uts_ns_tree; + +#define to_ns_tree(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(cgroup_ns_tree), \ + struct ipc_namespace *: &(ipc_ns_tree), \ + struct net *: &(net_ns_tree), \ + struct pid_namespace *: &(pid_ns_tree), \ + struct mnt_namespace *: &(mnt_ns_tree), \ + struct time_namespace *: &(time_ns_tree), \ + struct user_namespace *: &(user_ns_tree), \ + struct uts_namespace *: &(uts_ns_tree)) + +u64 ns_tree_gen_id(struct ns_common *ns); +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree); +void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree); +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type); +struct 
ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, + struct ns_tree *ns_tree, + bool previous); + +static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) +{ + ns_tree_gen_id(ns); + __ns_tree_add_raw(ns, ns_tree); +} + +/** + * ns_tree_add_raw - Add a namespace to a namespace + * @ns: Namespace to add + * + * This function adds a namespace to the appropriate namespace tree + * without assigning a id. + */ +#define ns_tree_add_raw(__ns) __ns_tree_add_raw(to_ns_common(__ns), to_ns_tree(__ns)) + +/** + * ns_tree_add - Add a namespace to a namespace tree + * @ns: Namespace to add + * + * This function assigns a new id to the namespace and adds it to the + * appropriate namespace tree and list. + */ +#define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns)) + +/** + * ns_tree_remove - Remove a namespace from a namespace tree + * @ns: Namespace to remove + * + * This function removes a namespace from the appropriate namespace + * tree and list. + */ +#define ns_tree_remove(__ns) __ns_tree_remove(to_ns_common(__ns), to_ns_tree(__ns)) + +#define ns_tree_adjoined_rcu(__ns, __previous) \ + __ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous) + +#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node)) + +#endif /* _LINUX_NSTREE_H */ diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index e50d312f9fee..7f89f0829e60 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -79,6 +79,9 @@ static inline int ns_common_init(struct ns_common *ns, refcount_set(&ns->count, 1); ns->stashed = NULL; ns->ops = ops; + ns->ns_id = 0; + RB_CLEAR_NODE(&ns->ns_tree_node); + INIT_LIST_HEAD(&ns->ns_list_node); return 0; } diff --git a/kernel/Makefile b/kernel/Makefile index c60623448235..b807516a1b43 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ - kthread.o sys_ni.o nsproxy.o \ + kthread.o sys_ni.o nsproxy.o nstree.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o diff --git a/kernel/nstree.c b/kernel/nstree.c new file mode 100644 index 000000000000..bbe8bedc924c --- /dev/null +++ b/kernel/nstree.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include + +struct ns_tree mnt_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock), + .type = CLONE_NEWNS, +}; + +struct ns_tree net_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock), + .type = CLONE_NEWNET, +}; +EXPORT_SYMBOL_GPL(net_ns_tree); + +struct ns_tree uts_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock), + .type = CLONE_NEWUTS, +}; + +struct ns_tree user_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock), + .type = CLONE_NEWUSER, +}; + +struct ns_tree ipc_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock), + .type = CLONE_NEWIPC, +}; + +struct ns_tree pid_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = 
LIST_HEAD_INIT(pid_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock), + .type = CLONE_NEWPID, +}; + +struct ns_tree cgroup_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock), + .type = CLONE_NEWCGROUP, +}; + +struct ns_tree time_ns_tree = { + .ns_tree = RB_ROOT, + .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list), + .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock), + .type = CLONE_NEWTIME, +}; + +DEFINE_COOKIE(namespace_cookie); + +static inline struct ns_common *node_to_ns(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_tree_node); +} + +static inline int ns_cmp(struct rb_node *a, const struct rb_node *b) +{ + struct ns_common *ns_a = node_to_ns(a); + struct ns_common *ns_b = node_to_ns(b); + u64 ns_id_a = ns_a->ns_id; + u64 ns_id_b = ns_b->ns_id; + + if (ns_id_a < ns_id_b) + return -1; + if (ns_id_a > ns_id_b) + return 1; + return 0; +} + +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) +{ + struct rb_node *node, *prev; + + VFS_WARN_ON_ONCE(!ns->ns_id); + + write_seqlock(&ns_tree->ns_tree_lock); + + VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + + node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); + /* + * If there's no previous entry simply add it after the + * head and if there is add it after the previous entry. + */ + prev = rb_prev(&ns->ns_tree_node); + if (!prev) + list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list); + else + list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node); + + write_sequnlock(&ns_tree->ns_tree_lock); + + VFS_WARN_ON_ONCE(node); +} + +void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) +{ + VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); + VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); + VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + + write_seqlock(&ns_tree->ns_tree_lock); + rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); + list_bidir_del_rcu(&ns->ns_list_node); + RB_CLEAR_NODE(&ns->ns_tree_node); + write_sequnlock(&ns_tree->ns_tree_lock); +} +EXPORT_SYMBOL_GPL(__ns_tree_remove); + +static int ns_find(const void *key, const struct rb_node *node) +{ + const u64 ns_id = *(u64 *)key; + const struct ns_common *ns = node_to_ns(node); + + if (ns_id < ns->ns_id) + return -1; + if (ns_id > ns->ns_id) + return 1; + return 0; +} + + +static struct ns_tree *ns_tree_from_type(int ns_type) +{ + switch (ns_type) { + case CLONE_NEWCGROUP: + return &cgroup_ns_tree; + case CLONE_NEWIPC: + return &ipc_ns_tree; + case CLONE_NEWNS: + return &mnt_ns_tree; + case CLONE_NEWNET: + return &net_ns_tree; + case CLONE_NEWPID: + return &pid_ns_tree; + case CLONE_NEWUSER: + return &user_ns_tree; + case CLONE_NEWUTS: + return &uts_ns_tree; + case CLONE_NEWTIME: + return &time_ns_tree; + } + + return NULL; +} + +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + struct ns_tree *ns_tree; + struct rb_node *node; + unsigned int seq; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); + + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return NULL; + + do { + seq = read_seqbegin(&ns_tree->ns_tree_lock); + node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find); + if (node) + break; + } while (read_seqretry(&ns_tree->ns_tree_lock, seq)); + + if (!node) + return NULL; + + VFS_WARN_ON_ONCE(node_to_ns(node)->ops->type != ns_type); + + return node_to_ns(node); +} + 
+/** + * ns_tree_adjoined_rcu - find the next/previous namespace in the same + * tree + * @ns: namespace to start from + * @previous: if true find the previous namespace, otherwise the next + * + * Find the next or previous namespace in the same tree as @ns. If + * there is no next/previous namespace, -ENOENT is returned. + */ +struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, + struct ns_tree *ns_tree, bool previous) +{ + struct list_head *list; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage"); + + if (previous) + list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node)); + else + list = rcu_dereference(list_next_rcu(&ns->ns_list_node)); + if (list_is_head(list, &ns_tree->ns_list)) + return ERR_PTR(-ENOENT); + + VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ops->type != ns_tree->type); + + return list_entry_rcu(list, struct ns_common, ns_list_node); +} + +/** + * ns_tree_gen_id - generate a new namespace id + * @ns: namespace to generate id for + * + * Generates a new namespace id and assigns it to the namespace. All + * namespaces types share the same id space and thus can be compared + * directly. IOW, when two ids of two namespace are equal, they are + * identical. + */ +u64 ns_tree_gen_id(struct ns_common *ns) +{ + guard(preempt)(); + ns->ns_id = gen_cookie_next(&namespace_cookie); + return ns->ns_id; +} -- cgit v1.2.3 From b36c823b9a4be5b0c8e38c3fd60cade7d41c216c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:46 +0200 Subject: time: support ns lookup Support the generic ns lookup infrastructure to support file handles for namespaces. Reviewed-by: Thomas Gleixner Signed-off-by: Christian Brauner --- include/linux/time_namespace.h | 5 +++++ init/main.c | 2 ++ kernel/time/namespace.c | 11 ++++++++++- 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index bb2c52f4fc94..7f6af7a9771e 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -33,6 +33,7 @@ struct time_namespace { extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS +void __init time_ns_init(void); extern int vdso_join_timens(struct task_struct *task, struct time_namespace *ns); extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); @@ -108,6 +109,10 @@ static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim) } #else +static inline void __init time_ns_init(void) +{ +} + static inline int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { diff --git a/init/main.c b/init/main.c index 0ee0ee7b7c2c..e7d2c57c65a7 100644 --- a/init/main.c +++ b/init/main.c @@ -103,6 +103,7 @@ #include #include #include +#include #include #include @@ -1072,6 +1073,7 @@ void start_kernel(void) fork_init(); proc_caches_init(); uts_ns_init(); + time_ns_init(); key_init(); security_init(); dbg_late_init(); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 0be93d8f2896..408f60d0a3b6 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -104,6 +105,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; ns->frozen_offsets = false; + ns_tree_add(ns); return ns; fail_free_page: @@ -250,11 +252,13 @@ out: void free_time_ns(struct 
time_namespace *ns) { + ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); __free_page(ns->vvar_page); - kfree(ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); } static struct time_namespace *to_time_ns(struct ns_common *ns) @@ -487,3 +491,8 @@ struct time_namespace init_time_ns = { .ns.ops = &timens_operations, .frozen_offsets = true, }; + +void __init time_ns_init(void) +{ + ns_tree_add(&init_time_ns); +} -- cgit v1.2.3 From d7afdf889561058068ab46fd8f306c70ef29216a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:49 +0200 Subject: ns: add to__ns() to respective headers Every namespace type has a container_of(ns, , ns) static inline function that is currently not exposed in the header. So we have a bunch of places that open-code it via container_of(). Move it to the headers so we can use it directly. Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- include/linux/cgroup.h | 5 +++++ include/linux/ipc_namespace.h | 5 +++++ include/linux/pid_namespace.h | 5 +++++ include/linux/time_namespace.h | 4 ++++ include/linux/user_namespace.h | 5 +++++ include/linux/utsname.h | 5 +++++ include/net/net_namespace.h | 5 +++++ ipc/namespace.c | 5 ----- kernel/cgroup/namespace.c | 5 ----- kernel/pid_namespace.c | 5 ----- kernel/time/namespace.c | 5 ----- kernel/user_namespace.c | 5 ----- kernel/utsname.c | 5 ----- net/core/net_namespace.c | 5 ----- 14 files changed, 34 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b18fb5fcb38e..9ca25346f7cb 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -794,6 +794,11 @@ extern struct cgroup_namespace init_cgroup_ns; #ifdef CONFIG_CGROUPS +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + void free_cgroup_ns(struct cgroup_namespace *ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e8240cf2611a..924e4754374f 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -129,6 +129,11 @@ static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif #if defined(CONFIG_IPC_NS) +static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) +{ + return container_of(ns, struct ipc_namespace, ns); +} + extern struct ipc_namespace *copy_ipcs(unsigned long flags, struct user_namespace *user_ns, struct ipc_namespace *ns); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c67a5811199..ba0efc8c8596 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -54,6 +54,11 @@ extern struct pid_namespace init_pid_ns; #define PIDNS_ADDING (1U << 31) #ifdef CONFIG_PID_NS +static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) +{ + return container_of(ns, struct pid_namespace, ns); +} + static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 7f6af7a9771e..a47a4ce4183e 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -33,6 +33,10 @@ struct time_namespace { extern struct time_namespace init_time_ns; #ifdef CONFIG_TIME_NS +static inline struct time_namespace *to_time_ns(struct ns_common *ns) +{ + return 
container_of(ns, struct time_namespace, ns); +} void __init time_ns_init(void); extern int vdso_join_timens(struct task_struct *task, struct time_namespace *ns); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index a0bb6d012137..a09056ad090e 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -168,6 +168,11 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, #ifdef CONFIG_USER_NS +static inline struct user_namespace *to_user_ns(struct ns_common *ns) +{ + return container_of(ns, struct user_namespace, ns); +} + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) diff --git a/include/linux/utsname.h b/include/linux/utsname.h index bf7613ba412b..5d34c4f0f945 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -30,6 +30,11 @@ struct uts_namespace { extern struct uts_namespace init_uts_ns; #ifdef CONFIG_UTS_NS +static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) +{ + return container_of(ns, struct uts_namespace, ns); +} + static inline void get_uts_ns(struct uts_namespace *ns) { refcount_inc(&ns->ns.count); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 025a7574b275..fd090ceb80bf 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -262,6 +262,11 @@ void ipx_unregister_sysctl(void); #ifdef CONFIG_NET_NS void __put_net(struct net *net); +static inline struct net *to_net_ns(struct ns_common *ns) +{ + return container_of(ns, struct net, ns); +} + /* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { diff --git a/ipc/namespace.c b/ipc/namespace.c index 9f923c1a1eb3..89588819956b 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -209,11 +209,6 @@ void put_ipc_ns(struct ipc_namespace *ns) } } -static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) -{ - return container_of(ns, struct ipc_namespace, ns); -} - static struct ns_common *ipcns_get(struct task_struct *task) { struct ipc_namespace *ns = NULL; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index fc12c416dfeb..5a327914b565 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -89,11 +89,6 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, return new_ns; } -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 228ae20299f9..9b327420309e 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -345,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } -static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) -{ - return container_of(ns, struct pid_namespace, ns); -} - static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 408f60d0a3b6..20b65f90549e 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -261,11 +261,6 @@ void free_time_ns(struct time_namespace *ns) kfree_rcu(ns, ns.ns_rcu); } -static struct time_namespace *to_time_ns(struct ns_common *ns) -{ - return container_of(ns, struct time_namespace, ns); -} - static struct ns_common *timens_get(struct task_struct *task) { struct time_namespace *ns = NULL; diff 
--git a/kernel/user_namespace.c b/kernel/user_namespace.c index ade5b6806c5c..cfb0e28f2779 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1325,11 +1325,6 @@ bool current_in_userns(const struct user_namespace *target_ns) } EXPORT_SYMBOL(current_in_userns); -static inline struct user_namespace *to_user_ns(struct ns_common *ns) -{ - return container_of(ns, struct user_namespace, ns); -} - static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; diff --git a/kernel/utsname.c b/kernel/utsname.c index 64155417ae0c..a682830742d3 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -103,11 +103,6 @@ void free_uts_ns(struct uts_namespace *ns) kfree_rcu(ns, ns.ns_rcu); } -static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) -{ - return container_of(ns, struct uts_namespace, ns); -} - static struct ns_common *utsns_get(struct task_struct *task) { struct uts_namespace *ns = NULL; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 169ec22c4758..a57b3cda8dbc 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1541,11 +1541,6 @@ static struct ns_common *netns_get(struct task_struct *task) return net ? &net->ns : NULL; } -static inline struct net *to_net_ns(struct ns_common *ns) -{ - return container_of(ns, struct net, ns); -} - static void netns_put(struct ns_common *ns) { put_net(to_net_ns(ns)); -- cgit v1.2.3 From d2afdb73f8ad77b49eca9d110d0c54bf30d1df0f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:50 +0200 Subject: nsfs: add current_in_namespace() Add a helper to easily check whether a given namespace is the caller's current namespace. This is currently open-coded in a lot of places. Simply switch on the type and compare the results. Reviewed-by: Aleksa Sarai Signed-off-by: Christian Brauner --- include/linux/nsfs.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h index fb84aa538091..e5a5fa83d36b 100644 --- a/include/linux/nsfs.h +++ b/include/linux/nsfs.h @@ -5,6 +5,8 @@ #define _LINUX_NSFS_H #include +#include +#include struct path; struct task_struct; @@ -22,5 +24,17 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task, const struct proc_ns_operations *ns_ops); void nsfs_init(void); -#endif /* _LINUX_NSFS_H */ +#define __current_namespace_from_type(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: current->nsproxy->cgroup_ns, \ + struct ipc_namespace *: current->nsproxy->ipc_ns, \ + struct net *: current->nsproxy->net_ns, \ + struct pid_namespace *: task_active_pid_ns(current), \ + struct mnt_namespace *: current->nsproxy->mnt_ns, \ + struct time_namespace *: current->nsproxy->time_ns, \ + struct user_namespace *: current_user_ns(), \ + struct uts_namespace *: current->nsproxy->uts_ns) + +#define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns) +#endif /* _LINUX_NSFS_H */ -- cgit v1.2.3 From 5222470b2fbb3740f931f189db33dd1367b1ae75 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:51 +0200 Subject: nsfs: support file handles A while ago we added support for file handles to pidfs so pidfds can be encoded and decoded as file handles. Userspace has adopted this quickly and it's proven very useful. Implement file handles for namespaces as well. A process is not always able to open /proc/self/ns/. 
That requires procfs to be mounted and for /proc/self/ or /proc/self/ns/ to not be overmounted. However, userspace can always derive a namespace fd from a pidfd. And that always works for a task's own namespace. There's no need to introduce unnecessary behavioral differences between /proc/self/ns/ fds, pidfd-derived namespace fds, and file-handle-derived namespace fds. So namespace file handles are always decodable if the caller is located in the namespace the file handle refers to. This also allows a task to e.g., store a set of file handles to its namespaces in a file on-disk so it can verify when it gets rexeced that they're still valid and so on. This is akin to the pidfd use-case. Or just plainly for namespace comparison reasons where a file handle to the task's own namespace can be easily compared against others. Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/nsfs.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/exportfs.h | 6 ++ include/uapi/linux/nsfs.h | 9 +++ 3 files changed, 173 insertions(+) (limited to 'include/linux') diff --git a/fs/nsfs.c b/fs/nsfs.c index 80e631aeb3ce..926e2680414e 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -13,6 +13,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include "mount.h" #include "internal.h" @@ -417,12 +423,164 @@ static const struct stashed_operations nsfs_stashed_ops = { .put_data = nsfs_put_data, }; +#define NSFS_FID_SIZE_U32_VER0 (NSFS_FILE_HANDLE_SIZE_VER0 / sizeof(u32)) +#define NSFS_FID_SIZE_U32_LATEST (NSFS_FILE_HANDLE_SIZE_LATEST / sizeof(u32)) + +static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) +{ + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct ns_common *ns = inode->i_private; + int len = *max_len; + + if (parent) + return FILEID_INVALID; + + if (len < NSFS_FID_SIZE_U32_VER0) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + return FILEID_INVALID; + } else if (len > NSFS_FID_SIZE_U32_LATEST) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + } + + fid->ns_id = ns->ns_id; + fid->ns_type = ns->ops->type; + fid->ns_inum = inode->i_ino; + return FILEID_NSFS; +} + +static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct path path __free(path_put) = {}; + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct user_namespace *owning_ns = NULL; + struct ns_common *ns; + int ret; + + if (fh_len < NSFS_FID_SIZE_U32_VER0) + return NULL; + + /* Check that any trailing bytes are zero. 
*/ + if ((fh_len > NSFS_FID_SIZE_U32_LATEST) && + memchr_inv((void *)fid + NSFS_FID_SIZE_U32_LATEST, 0, + fh_len - NSFS_FID_SIZE_U32_LATEST)) + return NULL; + + switch (fh_type) { + case FILEID_NSFS: + break; + default: + return NULL; + } + + scoped_guard(rcu) { + ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type); + if (!ns) + return NULL; + + VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); + VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type); + VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); + + if (!refcount_inc_not_zero(&ns->count)) + return NULL; + } + + switch (ns->ops->type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + if (!current_in_namespace(to_cg_ns(ns))) + owning_ns = to_cg_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + if (!current_in_namespace(to_ipc_ns(ns))) + owning_ns = to_ipc_ns(ns)->user_ns; + break; +#endif + case CLONE_NEWNS: + if (!current_in_namespace(to_mnt_ns(ns))) + owning_ns = to_mnt_ns(ns)->user_ns; + break; +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + if (!current_in_namespace(to_net_ns(ns))) + owning_ns = to_net_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + if (!current_in_namespace(to_pid_ns(ns))) { + owning_ns = to_pid_ns(ns)->user_ns; + } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + break; +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + if (!current_in_namespace(to_time_ns(ns))) + owning_ns = to_time_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + if (!current_in_namespace(to_user_ns(ns))) + owning_ns = to_user_ns(ns); + break; +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + if (!current_in_namespace(to_uts_ns(ns))) + owning_ns = to_uts_ns(ns)->user_ns; + break; +#endif + default: + return ERR_PTR(-EOPNOTSUPP); + } + + if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + + /* path_from_stashed() unconditionally consumes the reference. */ + ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); + if (ret) + return ERR_PTR(ret); + + return no_free_ptr(path.dentry); +} + +static int nsfs_export_permission(struct handle_to_path_ctx *ctx, + unsigned int oflags) +{ + /* nsfs_fh_to_dentry() performs all permission checks. */ + return 0; +} + +static struct file *nsfs_export_open(struct path *path, unsigned int oflags) +{ + return file_open_root(path, "", oflags, 0); +} + +static const struct export_operations nsfs_export_operations = { + .encode_fh = nsfs_encode_fh, + .fh_to_dentry = nsfs_fh_to_dentry, + .open = nsfs_export_open, + .permission = nsfs_export_permission, +}; + static int nsfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &nsfs_ops; + ctx->eops = &nsfs_export_operations; ctx->dops = &ns_dentry_operations; fc->s_fs_info = (void *)&nsfs_stashed_ops; return 0; diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index cfb0dd1ea49c..3aac58a520c7 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -122,6 +122,12 @@ enum fid_type { FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1, FILEID_BCACHEFS_WITH_PARENT = 0xb2, + /* + * + * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number. 
+ */ + FILEID_NSFS = 0xf1, + /* * 64 bit unique kernfs id */ diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 97d8d80d139f..fa86fe3c8bd3 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -53,4 +53,13 @@ enum init_ns_ino { MNT_NS_INIT_INO = 0xEFFFFFF8U, }; +struct nsfs_file_handle { + __u64 ns_id; + __u32 ns_type; + __u32 ns_inum; +}; + +#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */ +#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */ + #endif /* __LINUX_NSFS_H */ -- cgit v1.2.3 From 93f67a7ddadf6ed8997c000df9790e5d64617196 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:00 +0200 Subject: uts: split namespace into separate header We have dedicated headers for all namespace types. Add one for the uts namespace as well. Now it's consistent for all namespace types. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/uts_namespace.h | 65 +++++++++++++++++++++++++++++++++++++++++++ include/linux/utsname.h | 58 +------------------------------------- 2 files changed, 66 insertions(+), 57 deletions(-) create mode 100644 include/linux/uts_namespace.h (limited to 'include/linux') diff --git a/include/linux/uts_namespace.h b/include/linux/uts_namespace.h new file mode 100644 index 000000000000..c2b619bb4e57 --- /dev/null +++ b/include/linux/uts_namespace.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_UTS_NAMESPACE_H +#define _LINUX_UTS_NAMESPACE_H + +#include +#include + +struct user_namespace; +extern struct user_namespace init_user_ns; + +struct uts_namespace { + struct new_utsname name; + struct user_namespace *user_ns; + struct ucounts *ucounts; + struct ns_common ns; +} __randomize_layout; + +extern struct uts_namespace init_uts_ns; + +#ifdef CONFIG_UTS_NS +static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) +{ + return container_of(ns, struct uts_namespace, ns); +} + +static inline void get_uts_ns(struct uts_namespace *ns) +{ + refcount_inc(&ns->ns.count); +} + +extern struct uts_namespace *copy_utsname(unsigned long flags, + struct user_namespace *user_ns, struct uts_namespace *old_ns); +extern void free_uts_ns(struct uts_namespace *ns); + +static inline void put_uts_ns(struct uts_namespace *ns) +{ + if (refcount_dec_and_test(&ns->ns.count)) + free_uts_ns(ns); +} + +void uts_ns_init(void); +#else +static inline void get_uts_ns(struct uts_namespace *ns) +{ +} + +static inline void put_uts_ns(struct uts_namespace *ns) +{ +} + +static inline struct uts_namespace *copy_utsname(unsigned long flags, + struct user_namespace *user_ns, struct uts_namespace *old_ns) +{ + if (flags & CLONE_NEWUTS) + return ERR_PTR(-EINVAL); + + return old_ns; +} + +static inline void uts_ns_init(void) +{ +} +#endif + +#endif /* _LINUX_UTS_NAMESPACE_H */ diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 5d34c4f0f945..547bd4439706 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include enum uts_proc { UTS_PROC_ARCH, @@ -18,62 +18,6 @@ enum uts_proc { UTS_PROC_DOMAINNAME, }; -struct user_namespace; -extern struct user_namespace init_user_ns; - -struct uts_namespace { - struct new_utsname name; - struct user_namespace *user_ns; - struct ucounts *ucounts; - struct ns_common ns; -} __randomize_layout; -extern struct uts_namespace init_uts_ns; - -#ifdef CONFIG_UTS_NS -static inline struct uts_namespace 
*to_uts_ns(struct ns_common *ns)
-{
-	return container_of(ns, struct uts_namespace, ns);
-}
-
-static inline void get_uts_ns(struct uts_namespace *ns)
-{
-	refcount_inc(&ns->ns.count);
-}
-
-extern struct uts_namespace *copy_utsname(unsigned long flags,
-	struct user_namespace *user_ns, struct uts_namespace *old_ns);
-extern void free_uts_ns(struct uts_namespace *ns);
-
-static inline void put_uts_ns(struct uts_namespace *ns)
-{
-	if (refcount_dec_and_test(&ns->ns.count))
-		free_uts_ns(ns);
-}
-
-void uts_ns_init(void);
-#else
-static inline void get_uts_ns(struct uts_namespace *ns)
-{
-}
-
-static inline void put_uts_ns(struct uts_namespace *ns)
-{
-}
-
-static inline struct uts_namespace *copy_utsname(unsigned long flags,
-	struct user_namespace *user_ns, struct uts_namespace *old_ns)
-{
-	if (flags & CLONE_NEWUTS)
-		return ERR_PTR(-EINVAL);
-
-	return old_ns;
-}
-
-static inline void uts_ns_init(void)
-{
-}
-#endif
-
 #ifdef CONFIG_PROC_SYSCTL
 extern void uts_proc_notify(enum uts_proc proc);
 #else
-- cgit v1.2.3


From b2a0b192084acd0a86d66cbbc61e17ba1f5bd583 Mon Sep 17 00:00:00 2001
From: Christian Brauner
Date: Wed, 17 Sep 2025 12:28:01 +0200
Subject: mnt: expose pointer to init_mnt_ns

There are various scenarios where we need to know whether or not we are
in the initial set of namespaces, e.g., to shortcut permission checking.
All namespaces expose that information. Let's do that too.

Reviewed-by: Jan Kara
Signed-off-by: Christian Brauner
---
 fs/namespace.c                | 27 ++++++++++++++++-----------
 include/linux/mnt_namespace.h |  2 ++
 2 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index a68998449698..f0bddc9cf2a6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -6008,27 +6008,32 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
 	return ret;
 }
 
+struct mnt_namespace init_mnt_ns = {
+	.ns.inum = PROC_MNT_INIT_INO,
+	.ns.ops = &mntns_operations,
+	.user_ns = &init_user_ns,
+	.ns.count = REFCOUNT_INIT(1),
+	.passive = REFCOUNT_INIT(1),
+	.mounts = RB_ROOT,
+	.poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll),
+};
+
 static void __init init_mount_tree(void)
 {
 	struct vfsmount *mnt;
 	struct mount *m;
-	struct mnt_namespace *ns;
 	struct path root;
 
 	mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
 	if (IS_ERR(mnt))
 		panic("Can't create rootfs");
 
-	ns = alloc_mnt_ns(&init_user_ns, true);
-	if (IS_ERR(ns))
-		panic("Can't allocate initial namespace");
-	ns->ns.inum = PROC_MNT_INIT_INO;
 	m = real_mount(mnt);
-	ns->root = m;
-	ns->nr_mounts = 1;
-	mnt_add_to_ns(ns, m);
-	init_task.nsproxy->mnt_ns = ns;
-	get_mnt_ns(ns);
+	init_mnt_ns.root = m;
+	init_mnt_ns.nr_mounts = 1;
+	mnt_add_to_ns(&init_mnt_ns, m);
+	init_task.nsproxy->mnt_ns = &init_mnt_ns;
+	get_mnt_ns(&init_mnt_ns);
 
 	root.mnt = mnt;
 	root.dentry = mnt->mnt_root;
@@ -6036,7 +6041,7 @@ static void __init init_mount_tree(void)
 	set_fs_pwd(current->fs, &root);
 	set_fs_root(current->fs, &root);
 
-	ns_tree_add(ns);
+	ns_tree_add(&init_mnt_ns);
 }
 
 void __init mnt_init(void)
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 70b366b64816..6d1c4c218c14 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -11,6 +11,8 @@ struct fs_struct;
 struct user_namespace;
 struct ns_common;
 
+extern struct mnt_namespace init_mnt_ns;
+
 extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
 		struct user_namespace *, struct fs_struct *);
 extern void put_mnt_ns(struct mnt_namespace *ns);
-- cgit
v1.2.3 From f74ca6da113d5d4b21c00bb4da3f3c137162b4fe Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:02 +0200 Subject: nscommon: move to separate file It's really awkward spilling the ns common infrastructure into multiple headers. Move it to a separate file. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 3 +++ include/linux/proc_ns.h | 19 ------------------- kernel/Makefile | 2 +- kernel/nscommon.c | 21 +++++++++++++++++++++ 4 files changed, 25 insertions(+), 20 deletions(-) create mode 100644 kernel/nscommon.c (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 7224072cccc5..78b17fe80b62 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -31,6 +31,9 @@ struct ns_common { }; }; +int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, + bool alloc_inum); + #define to_ns_common(__ns) \ _Generic((__ns), \ struct cgroup_namespace *: &(__ns)->ns, \ diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 7f89f0829e60..9f21670b5824 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -66,25 +66,6 @@ static inline void proc_free_inum(unsigned int inum) {} #endif /* CONFIG_PROC_FS */ -static inline int ns_common_init(struct ns_common *ns, - const struct proc_ns_operations *ops, - bool alloc_inum) -{ - if (alloc_inum) { - int ret; - ret = proc_alloc_inum(&ns->inum); - if (ret) - return ret; - } - refcount_set(&ns->count, 1); - ns->stashed = NULL; - ns->ops = ops; - ns->ns_id = 0; - RB_CLEAR_NODE(&ns->ns_tree_node); - INIT_LIST_HEAD(&ns->ns_list_node); - return 0; -} - #define ns_free_inum(ns) proc_free_inum((ns)->inum) #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) diff --git a/kernel/Makefile b/kernel/Makefile index b807516a1b43..1f48f7cd2d7b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ extable.o params.o \ - kthread.o sys_ni.o nsproxy.o nstree.o \ + kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o diff --git a/kernel/nscommon.c b/kernel/nscommon.c new file mode 100644 index 000000000000..ebf4783d0505 --- /dev/null +++ b/kernel/nscommon.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include + +int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, + bool alloc_inum) +{ + if (alloc_inum) { + int ret; + ret = proc_alloc_inum(&ns->inum); + if (ret) + return ret; + } + refcount_set(&ns->count, 1); + ns->stashed = NULL; + ns->ops = ops; + ns->ns_id = 0; + RB_CLEAR_NODE(&ns->ns_tree_node); + INIT_LIST_HEAD(&ns->ns_list_node); + return 0; +} -- cgit v1.2.3 From 5fc6bef178f1b644f1439e520c8f83bfc83a1252 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:03 +0200 Subject: cgroup: split namespace into separate header We have dedicated headers for all namespace types. Add one for the cgroup namespace as well. Now it's consistent for all namespace types and easy to figure out what to include. 
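As an illustration (not part of the patch), a consumer that only needs the cgroup namespace type and its get/put helpers can now pull in the dedicated header instead of all of cgroup.h; the function below is hypothetical and shown only to demonstrate the include change:

    #include <linux/cgroup_namespace.h>

    /* hypothetical caller, for illustration only */
    static void example_borrow_cg_ns(struct cgroup_namespace *cgns)
    {
            get_cgroup_ns(cgns);            /* pin the namespace */
            /* ... inspect cgns->root_cset ... */
            put_cgroup_ns(cgns);            /* may call free_cgroup_ns() on last put */
    }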
Acked-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/cgroup.h | 51 +----------------------------------- include/linux/cgroup_namespace.h | 56 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 50 deletions(-) create mode 100644 include/linux/cgroup_namespace.h (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9ca25346f7cb..5156fed8cbc3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -27,6 +27,7 @@ #include #include +#include struct kernel_clone_args; @@ -783,56 +784,6 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ -struct cgroup_namespace { - struct ns_common ns; - struct user_namespace *user_ns; - struct ucounts *ucounts; - struct css_set *root_cset; -}; - -extern struct cgroup_namespace init_cgroup_ns; - -#ifdef CONFIG_CGROUPS - -static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) -{ - return container_of(ns, struct cgroup_namespace, ns); -} - -void free_cgroup_ns(struct cgroup_namespace *ns); - -struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, - struct user_namespace *user_ns, - struct cgroup_namespace *old_ns); - -int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, - struct cgroup_namespace *ns); - -static inline void get_cgroup_ns(struct cgroup_namespace *ns) -{ - refcount_inc(&ns->ns.count); -} - -static inline void put_cgroup_ns(struct cgroup_namespace *ns) -{ - if (refcount_dec_and_test(&ns->ns.count)) - free_cgroup_ns(ns); -} - -#else /* !CONFIG_CGROUPS */ - -static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } -static inline struct cgroup_namespace * -copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, - struct cgroup_namespace *old_ns) -{ - return old_ns; -} - -static inline void get_cgroup_ns(struct cgroup_namespace *ns) { } -static inline void put_cgroup_ns(struct cgroup_namespace *ns) { } - -#endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h new file mode 100644 index 000000000000..c02bb76c5e32 --- /dev/null +++ b/include/linux/cgroup_namespace.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CGROUP_NAMESPACE_H +#define _LINUX_CGROUP_NAMESPACE_H + +struct cgroup_namespace { + struct ns_common ns; + struct user_namespace *user_ns; + struct ucounts *ucounts; + struct css_set *root_cset; +}; + +extern struct cgroup_namespace init_cgroup_ns; + +#ifdef CONFIG_CGROUPS + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +void free_cgroup_ns(struct cgroup_namespace *ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns); + +int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns); + +static inline void get_cgroup_ns(struct cgroup_namespace *ns) +{ + refcount_inc(&ns->ns.count); +} + +static inline void put_cgroup_ns(struct cgroup_namespace *ns) +{ + if (refcount_dec_and_test(&ns->ns.count)) + free_cgroup_ns(ns); +} + +#else /* !CONFIG_CGROUPS */ + +static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } +static inline struct cgroup_namespace * +copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + return old_ns; +} + +static inline void 
get_cgroup_ns(struct cgroup_namespace *ns) { } +static inline void put_cgroup_ns(struct cgroup_namespace *ns) { } + +#endif /* !CONFIG_CGROUPS */ + +#endif /* _LINUX_CGROUP_NAMESPACE_H */ -- cgit v1.2.3 From 5612ff3ec588be09f11a9424db6d1186bcdeb3fa Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:07 +0200 Subject: nscommon: simplify initialization There's a lot of information that namespace implementers don't need to know about at all. Encapsulate this all in the initialization helper. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 5 +++-- include/linux/ns_common.h | 39 +++++++++++++++++++++++++++++++++++++-- ipc/namespace.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/nscommon.c | 17 ++++++++--------- kernel/pid_namespace.c | 2 +- kernel/time/namespace.c | 2 +- kernel/user_namespace.c | 2 +- kernel/utsname.c | 2 +- net/core/net_namespace.c | 2 +- 10 files changed, 55 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index b2fcb901ad8c..699b8c770c47 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4104,8 +4104,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } if (anon) - new_ns->ns.inum = MNT_NS_ANON_INO; - ret = ns_common_init(&new_ns->ns, &mntns_operations, !anon); + ret = ns_common_init_inum(new_ns, &mntns_operations, MNT_NS_ANON_INO); + else + ret = ns_common_init(new_ns, &mntns_operations); if (ret) { kfree(new_ns); dec_mnt_namespaces(ucounts); diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 78b17fe80b62..05c7a7dd211b 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -16,6 +16,15 @@ struct time_namespace; struct user_namespace; struct uts_namespace; +extern struct cgroup_namespace init_cgroup_ns; +extern struct ipc_namespace init_ipc_ns; +extern struct mnt_namespace init_mnt_ns; +extern struct net init_net; +extern struct pid_namespace init_pid_ns; +extern struct time_namespace init_time_ns; +extern struct user_namespace init_user_ns; +extern struct uts_namespace init_uts_ns; + struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; @@ -31,8 +40,7 @@ struct ns_common { }; }; -int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, - bool alloc_inum); +int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); #define to_ns_common(__ns) \ _Generic((__ns), \ @@ -45,4 +53,31 @@ int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, struct user_namespace *: &(__ns)->ns, \ struct uts_namespace *: &(__ns)->ns) +#define ns_init_inum(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ + struct ipc_namespace *: IPC_NS_INIT_INO, \ + struct mnt_namespace *: MNT_NS_INIT_INO, \ + struct net *: NET_NS_INIT_INO, \ + struct pid_namespace *: PID_NS_INIT_INO, \ + struct time_namespace *: TIME_NS_INIT_INO, \ + struct user_namespace *: USER_NS_INIT_INO, \ + struct uts_namespace *: UTS_NS_INIT_INO) + +#define ns_init_ns(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &init_cgroup_ns, \ + struct ipc_namespace *: &init_ipc_ns, \ + struct mnt_namespace *: &init_mnt_ns, \ + struct net *: &init_net, \ + struct pid_namespace *: &init_pid_ns, \ + struct time_namespace *: &init_time_ns, \ + struct user_namespace *: &init_user_ns, \ + struct uts_namespace *: &init_uts_ns) + +#define ns_common_init(__ns, __ops) \ + __ns_common_init(to_ns_common(__ns), __ops, (((__ns) == 
ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) + +#define ns_common_init_inum(__ns, __ops, __inum) __ns_common_init(to_ns_common(__ns), __ops, __inum) + #endif diff --git a/ipc/namespace.c b/ipc/namespace.c index 89588819956b..0f8bbd18a475 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -62,7 +62,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (ns == NULL) goto fail_dec; - err = ns_common_init(&ns->ns, &ipcns_operations, true); + err = ns_common_init(ns, &ipcns_operations); if (err) goto fail_free; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 5a327914b565..d928c557e28b 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -27,7 +27,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = ns_common_init(&new_ns->ns, &cgroupns_operations, true); + ret = ns_common_init(new_ns, &cgroupns_operations); if (ret) return ERR_PTR(ret); ns_tree_add(new_ns); diff --git a/kernel/nscommon.c b/kernel/nscommon.c index e10fad8afe61..c3a90bb665ad 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -1,21 +1,20 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include -int ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, - bool alloc_inum) +int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum) { - if (alloc_inum && !ns->inum) { - int ret; - ret = proc_alloc_inum(&ns->inum); - if (ret) - return ret; - } refcount_set(&ns->count, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; RB_CLEAR_NODE(&ns->ns_tree_node); INIT_LIST_HEAD(&ns->ns_list_node); - return 0; + + if (inum) { + ns->inum = inum; + return 0; + } + return proc_alloc_inum(&ns->inum); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 9b327420309e..170757c265c2 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -103,7 +103,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_idr; - err = ns_common_init(&ns->ns, &pidns_operations, true); + err = ns_common_init(ns, &pidns_operations); if (err) goto out_free_idr; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 20b65f90549e..ce8e952104a7 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -97,7 +97,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, if (!ns->vvar_page) goto fail_free; - err = ns_common_init(&ns->ns, &timens_operations, true); + err = ns_common_init(ns, &timens_operations); if (err) goto fail_free_page; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index cfb0e28f2779..db9f0463219c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -126,7 +126,7 @@ int create_user_ns(struct cred *new) ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); - ret = ns_common_init(&ns->ns, &userns_operations, true); + ret = ns_common_init(ns, &userns_operations); if (ret) goto fail_free; diff --git a/kernel/utsname.c b/kernel/utsname.c index a682830742d3..399888be66bd 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -50,7 +50,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, if (!ns) goto fail_dec; - err = ns_common_init(&ns->ns, &utsns_operations, true); + err = ns_common_init(ns, &utsns_operations); if (err) goto fail_free; diff --git a/net/core/net_namespace.c 
b/net/core/net_namespace.c index 9df236811454..e50897fba8cd 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -409,7 +409,7 @@ static __net_init int preinit_net(struct net *net, struct user_namespace *user_n ns_ops = NULL; #endif - ret = ns_common_init(&net->ns, ns_ops, true); + ret = ns_common_init(net, ns_ops); if (ret) return ret; -- cgit v1.2.3 From be5f21d3985f00827e09b798f7a07ebd6dd7f54a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:08 +0200 Subject: ns: add ns_common_free() And drop ns_free_inum(). Anything common that can be freed centrally should be freed in the new common helper. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 4 ++-- include/linux/ns_common.h | 3 +++ include/linux/proc_ns.h | 2 -- ipc/namespace.c | 4 ++-- kernel/cgroup/namespace.c | 2 +- kernel/nscommon.c | 5 +++++ kernel/pid_namespace.c | 4 ++-- kernel/time/namespace.c | 2 +- kernel/user_namespace.c | 4 ++-- kernel/utsname.c | 2 +- net/core/net_namespace.c | 4 ++-- 11 files changed, 21 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 699b8c770c47..b9f94769ec11 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4082,7 +4082,7 @@ static void dec_mnt_namespaces(struct ucounts *ucounts) static void free_mnt_ns(struct mnt_namespace *ns) { if (!is_anon_ns(ns)) - ns_free_inum(&ns->ns); + ns_common_free(ns); dec_mnt_namespaces(ns->ucounts); mnt_ns_tree_remove(ns); } @@ -4154,7 +4154,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); - ns_free_inum(&new_ns->ns); + ns_common_free(ns); dec_mnt_namespaces(new_ns->ucounts); mnt_ns_release(new_ns); return ERR_CAST(new); diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 05c7a7dd211b..19833ac547f9 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -41,6 +41,7 @@ struct ns_common { }; int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); +void __ns_common_free(struct ns_common *ns); #define to_ns_common(__ns) \ _Generic((__ns), \ @@ -80,4 +81,6 @@ int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, #define ns_common_init_inum(__ns, __ops, __inum) __ns_common_init(to_ns_common(__ns), __ops, __inum) +#define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) + #endif diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 9f21670b5824..08016f6e0e6f 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -66,8 +66,6 @@ static inline void proc_free_inum(unsigned int inum) {} #endif /* CONFIG_PROC_FS */ -#define ns_free_inum(ns) proc_free_inum((ns)->inum) - #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) #endif /* _LINUX_PROC_NS_H */ diff --git a/ipc/namespace.c b/ipc/namespace.c index 0f8bbd18a475..09d261a1a2aa 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -97,7 +97,7 @@ fail_mq: fail_put: put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); fail_free: kfree(ns); fail_dec: @@ -161,7 +161,7 @@ static void free_ipc_ns(struct ipc_namespace *ns) dec_ipc_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); kfree(ns); } diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index d928c557e28b..16ead7508371 100644 --- a/kernel/cgroup/namespace.c +++ 
b/kernel/cgroup/namespace.c @@ -40,7 +40,7 @@ void free_cgroup_ns(struct cgroup_namespace *ns) put_css_set(ns->root_cset); dec_cgroup_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); } diff --git a/kernel/nscommon.c b/kernel/nscommon.c index c3a90bb665ad..7c1b07e2a6c9 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -18,3 +18,8 @@ int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, } return proc_alloc_inum(&ns->inum); } + +void __ns_common_free(struct ns_common *ns) +{ + proc_free_inum(ns->inum); +} diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 170757c265c2..27e2dd9ee051 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -127,7 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns return ns; out_free_inum: - ns_free_inum(&ns->ns); + ns_common_free(ns); out_free_idr: idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); @@ -152,7 +152,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns) ns_tree_remove(ns); unregister_pidns_sysctls(ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index ce8e952104a7..d49c73015d6e 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -255,7 +255,7 @@ void free_time_ns(struct time_namespace *ns) ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); __free_page(ns->vvar_page); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index db9f0463219c..32406bcab526 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -165,7 +165,7 @@ fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif - ns_free_inum(&ns->ns); + ns_common_free(ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: @@ -220,7 +220,7 @@ static void free_user_ns(struct work_struct *work) #endif retire_userns_sysctls(ns); key_free_user_ns(ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); diff --git a/kernel/utsname.c b/kernel/utsname.c index 399888be66bd..95d733eb2c98 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -98,7 +98,7 @@ void free_uts_ns(struct uts_namespace *ns) ns_tree_remove(ns); dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. 
*/ kfree_rcu(ns, ns.ns_rcu); } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index e50897fba8cd..a6a3de56a81c 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -590,7 +590,7 @@ struct net *copy_net_ns(unsigned long flags, if (rv < 0) { put_userns: - ns_free_inum(&net->ns); + ns_common_free(net); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif @@ -713,7 +713,7 @@ static void cleanup_net(struct work_struct *work) /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); - ns_free_inum(&net->ns); + ns_common_free(net); dec_net_namespaces(net->ucounts); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); -- cgit v1.2.3 From 224ef741ce87aa6474b82e0eb76e0e8e1bafe544 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:46 +0200 Subject: ns: add reference count helpers Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 45 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 19833ac547f9..65e258e1fdc6 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -43,16 +43,24 @@ struct ns_common { int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); -#define to_ns_common(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &(__ns)->ns, \ - struct ipc_namespace *: &(__ns)->ns, \ - struct mnt_namespace *: &(__ns)->ns, \ - struct net *: &(__ns)->ns, \ - struct pid_namespace *: &(__ns)->ns, \ - struct time_namespace *: &(__ns)->ns, \ - struct user_namespace *: &(__ns)->ns, \ - struct uts_namespace *: &(__ns)->ns) +#define to_ns_common(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(__ns)->ns, \ + const struct cgroup_namespace *: &(__ns)->ns, \ + struct ipc_namespace *: &(__ns)->ns, \ + const struct ipc_namespace *: &(__ns)->ns, \ + struct mnt_namespace *: &(__ns)->ns, \ + const struct mnt_namespace *: &(__ns)->ns, \ + struct net *: &(__ns)->ns, \ + const struct net *: &(__ns)->ns, \ + struct pid_namespace *: &(__ns)->ns, \ + const struct pid_namespace *: &(__ns)->ns, \ + struct time_namespace *: &(__ns)->ns, \ + const struct time_namespace *: &(__ns)->ns, \ + struct user_namespace *: &(__ns)->ns, \ + const struct user_namespace *: &(__ns)->ns, \ + struct uts_namespace *: &(__ns)->ns, \ + const struct uts_namespace *: &(__ns)->ns) #define ns_init_inum(__ns) \ _Generic((__ns), \ @@ -83,4 +91,21 @@ void __ns_common_free(struct ns_common *ns); #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) +static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) +{ + return refcount_dec_and_test(&ns->count); +} + +static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) +{ + return refcount_inc_not_zero(&ns->count); +} + +#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->count) +#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->count) +#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) +#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) +#define ns_ref_put_and_lock(__ns, __lock) \ + refcount_dec_and_lock(&to_ns_common((__ns))->count, (__lock)) + #endif -- cgit v1.2.3 From 06099e374f3ab818f0501671b21493ba2e1b94b9 Mon Sep 17 00:00:00 2001 From: Christian 
Brauner Date: Thu, 18 Sep 2025 12:11:48 +0200 Subject: cgroup: port to ns_ref_*() helpers Stop accessing ns.count directly. Acked-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/cgroup_namespace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h index c02bb76c5e32..b7dbf4d623d2 100644 --- a/include/linux/cgroup_namespace.h +++ b/include/linux/cgroup_namespace.h @@ -29,12 +29,12 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, static inline void get_cgroup_ns(struct cgroup_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { - if (refcount_dec_and_test(&ns->ns.count)) + if (ns_ref_put(ns)) free_cgroup_ns(ns); } -- cgit v1.2.3 From d4825c99d6a738c565d5142ce37369368a4352da Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:49 +0200 Subject: ipc: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/ipc_namespace.h | 4 ++-- ipc/namespace.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 924e4754374f..21eff63f47da 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -140,14 +140,14 @@ extern struct ipc_namespace *copy_ipcs(unsigned long flags, static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { if (ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) { if (ns) { - if (refcount_inc_not_zero(&ns->ns.count)) + if (ns_ref_get(ns)) return ns; } diff --git a/ipc/namespace.c b/ipc/namespace.c index 09d261a1a2aa..bd85d1c9d2c2 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -199,7 +199,7 @@ static void free_ipc(struct work_struct *unused) */ void put_ipc_ns(struct ipc_namespace *ns) { - if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) { + if (ns_ref_put_and_lock(ns, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); -- cgit v1.2.3 From 07897b38eadf5a370a6001790239f23036d5b970 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:50 +0200 Subject: pid: port to ns_ref_*() helpers Stop accessing ns.count directly. 
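For context, a minimal sketch of what the helpers boil down to for the cgroup case (the helper definitions come from the earlier "ns: add reference count helpers" patch; the expansion shown here is illustrative, not generated output):

    /* ns_ref_inc(ns) with a struct cgroup_namespace *ns is roughly: */
    refcount_inc(&to_ns_common(ns)->count);         /* i.e. &ns->ns.count */

    /* ns_ref_put(ns) wraps refcount_dec_and_test() and is __must_check: */
    if (ns_ref_put(ns))
            free_cgroup_ns(ns);

The generated code is unchanged; only the direct field access goes away.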
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/pid_namespace.h | 2 +- kernel/pid_namespace.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index ba0efc8c8596..5b2f29d369c4 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -62,7 +62,7 @@ static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 27e2dd9ee051..162f5fb63d75 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -169,7 +169,7 @@ static void destroy_pid_namespace_work(struct work_struct *work) parent = ns->parent; destroy_pid_namespace(ns); ns = parent; - } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)); + } while (ns != &init_pid_ns && ns_ref_put(ns)); } struct pid_namespace *copy_pid_ns(unsigned long flags, @@ -184,7 +184,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, void put_pid_ns(struct pid_namespace *ns) { - if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count)) + if (ns && ns != &init_pid_ns && ns_ref_put(ns)) schedule_work(&ns->work); } EXPORT_SYMBOL_GPL(put_pid_ns); -- cgit v1.2.3 From e0c173f1fa02c0b08720aa8aa0cc91c3063146ae Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:51 +0200 Subject: time: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/time_namespace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index a47a4ce4183e..f3b9567cf1f4 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -44,7 +44,7 @@ extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } @@ -57,7 +57,7 @@ struct page *find_timens_vvar_page(struct vm_area_struct *vma); static inline void put_time_ns(struct time_namespace *ns) { - if (refcount_dec_and_test(&ns->ns.count)) + if (ns_ref_put(ns)) free_time_ns(ns); } -- cgit v1.2.3 From 96d997ea5ad1911cc393ffdb5c928b532f2f921a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:52 +0200 Subject: user: port to ns_ref_*() helpers Stop accessing ns.count directly. 
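The same substitution also works as the loop condition in the free_user_ns() parent walk, since ns_ref_put() returns true only when the last reference is dropped; a condensed sketch of the resulting loop (from the diff below):

    do {
            parent = ns->parent;
            /* ... tear down one namespace level ... */
            ns = parent;
    } while (ns_ref_put(parent));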
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/user_namespace.h | 4 ++-- kernel/user_namespace.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index a09056ad090e..9a9aebbf96b9 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -176,7 +176,7 @@ static inline struct user_namespace *to_user_ns(struct ns_common *ns) static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); return ns; } @@ -186,7 +186,7 @@ extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { - if (ns && refcount_dec_and_test(&ns->ns.count)) + if (ns && ns_ref_put(ns)) __put_user_ns(ns); } diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 32406bcab526..f9df45c46235 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -225,7 +225,7 @@ static void free_user_ns(struct work_struct *work) kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); ns = parent; - } while (refcount_dec_and_test(&parent->ns.count)); + } while (ns_ref_put(parent)); } void __put_user_ns(struct user_namespace *ns) -- cgit v1.2.3 From 2438b7d63ad866d6b2bb7b8d3455a6365d9b0fbe Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:56 +0200 Subject: uts: port to ns_ref_*() helpers Stop accessing ns.count directly. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/uts_namespace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uts_namespace.h b/include/linux/uts_namespace.h index c2b619bb4e57..23b4f0e1b338 100644 --- a/include/linux/uts_namespace.h +++ b/include/linux/uts_namespace.h @@ -25,7 +25,7 @@ static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) static inline void get_uts_ns(struct uts_namespace *ns) { - refcount_inc(&ns->ns.count); + ns_ref_inc(ns); } extern struct uts_namespace *copy_utsname(unsigned long flags, @@ -34,7 +34,7 @@ extern void free_uts_ns(struct uts_namespace *ns); static inline void put_uts_ns(struct uts_namespace *ns) { - if (refcount_dec_and_test(&ns->ns.count)) + if (ns_ref_put(ns)) free_uts_ns(ns); } -- cgit v1.2.3 From 024596a4e2802e457a9f92af79f246fa9631f8de Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 18 Sep 2025 12:11:59 +0200 Subject: ns: rename to __ns_ref Make it easier to grep by renaming ns_common->count to __ns_ref. 
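A sketch of the intended call-site style after the rename (the accessor comes from the earlier helper patch; the commented-out line is shown only as the anti-pattern the rename is meant to make stand out in a grep):

    /* Don't: refcount_read(&ns->ns.__ns_ref); the field is private now. */
    unsigned int refs = ns_ref_read(ns);    /* works for any namespace type */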
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 2 +- include/linux/ns_common.h | 12 ++++++------ init/version-timestamp.c | 2 +- ipc/msgutil.c | 2 +- kernel/cgroup/cgroup.c | 2 +- kernel/nscommon.c | 2 +- kernel/pid.c | 2 +- kernel/time/namespace.c | 2 +- kernel/user.c | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 9109069d85cd..740a6ba524d0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -6015,7 +6015,7 @@ struct mnt_namespace init_mnt_ns = { .ns.inum = PROC_MNT_INIT_INO, .ns.ops = &mntns_operations, .user_ns = &init_user_ns, - .ns.count = REFCOUNT_INIT(1), + .ns.__ns_ref = REFCOUNT_INIT(1), .passive = REFCOUNT_INIT(1), .mounts = RB_ROOT, .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 65e258e1fdc6..aea8528d799a 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -29,7 +29,7 @@ struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; unsigned int inum; - refcount_t count; + refcount_t __ns_ref; /* do not use directly */ union { struct { u64 ns_id; @@ -93,19 +93,19 @@ void __ns_common_free(struct ns_common *ns); static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) { - return refcount_dec_and_test(&ns->count); + return refcount_dec_and_test(&ns->__ns_ref); } static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) { - return refcount_inc_not_zero(&ns->count); + return refcount_inc_not_zero(&ns->__ns_ref); } -#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->count) -#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->count) +#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref) +#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref) #define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) #define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) #define ns_ref_put_and_lock(__ns, __lock) \ - refcount_dec_and_lock(&to_ns_common((__ns))->count, (__lock)) + refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock)) #endif diff --git a/init/version-timestamp.c b/init/version-timestamp.c index 043cbf80a766..547e522e6016 100644 --- a/init/version-timestamp.c +++ b/init/version-timestamp.c @@ -8,7 +8,7 @@ #include struct uts_namespace init_uts_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .name = { .sysname = UTS_SYSNAME, .nodename = UTS_NODENAME, diff --git a/ipc/msgutil.c b/ipc/msgutil.c index bbf61275df41..d0f7dcf4c208 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -27,7 +27,7 @@ DEFINE_SPINLOCK(mq_lock); * and not CONFIG_IPC_NS. 
*/ struct ipc_namespace init_ipc_ns = { - .ns.count = REFCOUNT_INIT(1), + .ns.__ns_ref = REFCOUNT_INIT(1), .user_ns = &init_user_ns, .ns.inum = PROC_IPC_INIT_INO, #ifdef CONFIG_IPC_NS diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 092e6bf081ed..a0e24adceef0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -219,7 +219,7 @@ static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_D /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 7c1b07e2a6c9..7aa2be6a0c32 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -5,7 +5,7 @@ int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum) { - refcount_set(&ns->count, 1); + refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; diff --git a/kernel/pid.c b/kernel/pid.c index c45a28c16cd2..e222426f745d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -71,7 +71,7 @@ static int pid_max_max = PID_MAX_LIMIT; * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .ns.count = REFCOUNT_INIT(2), + .ns.__ns_ref = REFCOUNT_INIT(2), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index d49c73015d6e..d70bdfb7b001 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -480,7 +480,7 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .ns.count = REFCOUNT_INIT(3), + .ns.__ns_ref = REFCOUNT_INIT(3), .user_ns = &init_user_ns, .ns.inum = PROC_TIME_INIT_INO, .ns.ops = &timens_operations, diff --git a/kernel/user.c b/kernel/user.c index f46b1d41163b..17a742fb4e10 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -65,7 +65,7 @@ struct user_namespace init_user_ns = { .nr_extents = 1, }, }, - .ns.count = REFCOUNT_INIT(3), + .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .ns.inum = PROC_USER_INIT_INO, -- cgit v1.2.3 From daf4c2929fb792d24af0cd7bb6ca1f2949190fa4 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:34 -0700 Subject: bpf: bpf_verifier_state->cleaned flag instead of REG_LIVE_DONE Prepare for bpf_reg_state->live field removal by introducing a separate flag to track if clean_verifier_state() had been applied to the state. No functional changes. 
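Condensed from the diff below, the visited-states scan in clean_live_states() now keys off the per-state flag instead of a register's liveness bits:

    - if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE)
    + if (sl->state.cleaned)
              /* all regs in this state in all frames were already marked */
              continue;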
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-1-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/log.c | 6 ++---- kernel/bpf/verifier.c | 13 ++++--------- 3 files changed, 7 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 020de62bd09c..ac16da8b49dc 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -45,7 +45,6 @@ enum bpf_reg_liveness { REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */ - REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */ }; #define ITER_PREFIX "bpf_iter_" @@ -445,6 +444,7 @@ struct bpf_verifier_state { bool speculative; bool in_sleepable; + bool cleaned; /* first and last insn idx of this verifier state */ u32 first_insn_idx; diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index e4983c1303e7..0d6d7bfb2fd0 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -545,14 +545,12 @@ static char slot_type_char[] = { static void print_liveness(struct bpf_verifier_env *env, enum bpf_reg_liveness live) { - if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)) - verbose(env, "_"); + if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) + verbose(env, "_"); if (live & REG_LIVE_READ) verbose(env, "r"); if (live & REG_LIVE_WRITTEN) verbose(env, "w"); - if (live & REG_LIVE_DONE) - verbose(env, "D"); } #define UNUM_MAX_DECIMAL U16_MAX diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aef6b266f08d..47cec5c8abff 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1758,6 +1758,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, return err; dst_state->speculative = src->speculative; dst_state->in_sleepable = src->in_sleepable; + dst_state->cleaned = src->cleaned; dst_state->curframe = src->curframe; dst_state->branches = src->branches; dst_state->parent = src->parent; @@ -3589,11 +3590,6 @@ static int mark_reg_read(struct bpf_verifier_env *env, /* if read wasn't screened by an earlier write ... */ if (writes && state->live & REG_LIVE_WRITTEN) break; - if (verifier_bug_if(parent->live & REG_LIVE_DONE, env, - "type %s var_off %lld off %d", - reg_type_str(env, parent->type), - parent->var_off.value, parent->off)) - return -EFAULT; /* The first condition is more likely to be true than the * second, checked it first. 
*/ @@ -18501,7 +18497,6 @@ static void clean_func_state(struct bpf_verifier_env *env, for (i = 0; i < BPF_REG_FP; i++) { live = st->regs[i].live; /* liveness must not touch this register anymore */ - st->regs[i].live |= REG_LIVE_DONE; if (!(live & REG_LIVE_READ)) /* since the register is unused, clear its state * to make further comparison simpler @@ -18512,7 +18507,6 @@ static void clean_func_state(struct bpf_verifier_env *env, for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { live = st->stack[i].spilled_ptr.live; /* liveness must not touch this stack slot anymore */ - st->stack[i].spilled_ptr.live |= REG_LIVE_DONE; if (!(live & REG_LIVE_READ)) { __mark_reg_not_init(env, &st->stack[i].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) @@ -18526,6 +18520,7 @@ static void clean_verifier_state(struct bpf_verifier_env *env, { int i; + st->cleaned = true; for (i = 0; i <= st->curframe; i++) clean_func_state(env, st->frame[i]); } @@ -18553,7 +18548,7 @@ static void clean_verifier_state(struct bpf_verifier_env *env, * their final liveness marks are already propagated. * Hence when the verifier completes the search of state list in is_state_visited() * we can call this clean_live_states() function to mark all liveness states - * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state' + * as st->cleaned to indicate that 'parent' pointers of 'struct bpf_reg_state' * will not be used. * This function also clears the registers and stack for states that !READ * to simplify state merging. @@ -18576,7 +18571,7 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, if (sl->state.insn_idx != insn || !same_callsites(&sl->state, cur)) continue; - if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) + if (sl->state.cleaned) /* all regs in this state in all frames were already marked */ continue; if (incomplete_read_marks(env, &sl->state)) -- cgit v1.2.3 From 3b20d3c120bae1e18ee11aa04531b161743db682 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:37 -0700 Subject: bpf: declare a few utility functions as internal api Namely, rename the following functions and add prototypes to bpf_verifier.h: - find_containing_subprog -> bpf_find_containing_subprog - insn_successors -> bpf_insn_successors - calls_callback -> bpf_calls_callback - fmt_stack_mask -> bpf_fmt_stack_mask Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-4-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ kernel/bpf/verifier.c | 34 ++++++++++++++++------------------ 2 files changed, 21 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index ac16da8b49dc..93563564bde5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1065,4 +1065,9 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, u32 frameno); +struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off); +int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]); +void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); +bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 74a96a0d6c8a..921a5fa06df7 100644 --- 
a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2979,7 +2979,7 @@ static int cmp_subprogs(const void *a, const void *b) } /* Find subprogram that contains instruction at 'off' */ -static struct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env *env, int off) +struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *vals = env->subprog_info; int l, r, m; @@ -3004,7 +3004,7 @@ static int find_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *p; - p = find_containing_subprog(env, off); + p = bpf_find_containing_subprog(env, off); if (!p || p->start != off) return -ENOENT; return p - env->subprog_info; @@ -4211,7 +4211,7 @@ static void fmt_reg_mask(char *buf, ssize_t buf_sz, u32 reg_mask) } } /* format stack slots bitmask, e.g., "-8,-24,-40" for 0x15 mask */ -static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) +void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) { DECLARE_BITMAP(mask, 64); bool first = true; @@ -4266,8 +4266,6 @@ static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_histo } } -static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); - /* For given verifier state backtrack_insn() is called from the last insn to * the first insn. Its purpose is to compute a bitmask of registers and * stack slots that needs precision in the parent verifier state. @@ -4294,7 +4292,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_reg_mask(bt)); verbose(env, "mark_precise: frame%d: regs=%s ", bt->frame, env->tmp_str_buf); - fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); + bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); verbose(env, "stack=%s before ", env->tmp_str_buf); verbose(env, "%d: ", idx); verbose_insn(env, insn); @@ -4495,7 +4493,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * backtracking, as these registers are set by the function * invoking callback. 
*/ - if (subseq_idx >= 0 && calls_callback(env, subseq_idx)) + if (subseq_idx >= 0 && bpf_calls_callback(env, subseq_idx)) for (i = BPF_REG_1; i <= BPF_REG_5; i++) bt_clear_reg(bt, i); if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) { @@ -4934,7 +4932,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, bt_frame_reg_mask(bt, fr)); verbose(env, "mark_precise: frame%d: parent state regs=%s ", fr, env->tmp_str_buf); - fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, + bpf_fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_frame_stack_mask(bt, fr)); verbose(env, "stack=%s: ", env->tmp_str_buf); print_verifier_state(env, st, fr, true); @@ -11023,7 +11021,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) "At callback return", "R0"); return -EINVAL; } - if (!calls_callback(env, callee->callsite)) { + if (!bpf_calls_callback(env, callee->callsite)) { verifier_bug(env, "in callback at %d, callsite %d !calls_callback", *insn_idx, callee->callsite); return -EFAULT; @@ -17298,7 +17296,7 @@ static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *subprog; - subprog = find_containing_subprog(env, off); + subprog = bpf_find_containing_subprog(env, off); subprog->changes_pkt_data = true; } @@ -17306,7 +17304,7 @@ static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *subprog; - subprog = find_containing_subprog(env, off); + subprog = bpf_find_containing_subprog(env, off); subprog->might_sleep = true; } @@ -17320,8 +17318,8 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) { struct bpf_subprog_info *caller, *callee; - caller = find_containing_subprog(env, t); - callee = find_containing_subprog(env, w); + caller = bpf_find_containing_subprog(env, t); + callee = bpf_find_containing_subprog(env, w); caller->changes_pkt_data |= callee->changes_pkt_data; caller->might_sleep |= callee->might_sleep; } @@ -17391,7 +17389,7 @@ static void mark_calls_callback(struct bpf_verifier_env *env, int idx) env->insn_aux_data[idx].calls_callback = true; } -static bool calls_callback(struct bpf_verifier_env *env, int insn_idx) +bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx) { return env->insn_aux_data[insn_idx].calls_callback; } @@ -19439,7 +19437,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) goto hit; } } - if (calls_callback(env, insn_idx)) { + if (bpf_calls_callback(env, insn_idx)) { if (states_equal(env, &sl->state, cur, RANGE_WITHIN)) goto hit; goto skip_inf_loop_check; @@ -24171,7 +24169,7 @@ static bool can_jump(struct bpf_insn *insn) return false; } -static int insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) +int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) { struct bpf_insn *insn = &prog->insnsi[idx]; int i = 0, insn_sz; @@ -24387,7 +24385,7 @@ static int compute_live_registers(struct bpf_verifier_env *env) u16 new_out = 0; u16 new_in = 0; - succ_num = insn_successors(env->prog, insn_idx, succ); + succ_num = bpf_insn_successors(env->prog, insn_idx, succ); for (int s = 0; s < succ_num; ++s) new_out |= state[succ[s]].in; new_in = (new_out & ~live->def) | live->use; @@ -24556,7 +24554,7 @@ dfs_continue: stack[stack_sz++] = w; } /* Visit 'w' successors */ - succ_cnt = insn_successors(env->prog, w, succ); + succ_cnt = bpf_insn_successors(env->prog, w, succ); for (j = 0; j < succ_cnt; ++j) { if (pre[succ[j]]) { low[w] = min(low[w], low[succ[j]]); -- cgit v1.2.3 From 
efcda22aa541bbda827e54302baf9ae4fd44cdf2 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:38 -0700 Subject: bpf: compute instructions postorder per subprogram The next patch would require doing postorder traversal of individual subprograms. Facilitate this by moving env->cfg.insn_postorder computation from check_cfg() to a separate pass, as check_cfg() descends into called subprograms (and it needs to, because of merge_callee_effects() logic). env->cfg.insn_postorder is used only by compute_live_registers(); this function does not track cross-subprogram dependencies, thus the change does not affect its operation. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-5-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 6 +++- kernel/bpf/verifier.c | 68 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 60 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 93563564bde5..bd87e80f9423 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -665,6 +665,7 @@ struct bpf_subprog_info { /* 'start' has to be the first field otherwise find_subprog() won't work */ u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ + u32 postorder_start; /* The idx to the env->cfg.insn_postorder */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; /* offsets in range [stack_depth .. fastcall_stack_off) @@ -794,7 +795,10 @@ struct bpf_verifier_env { struct { int *insn_state; int *insn_stack; - /* vector of instruction indexes sorted in post-order */ + /* + * vector of instruction indexes sorted in post-order, grouped by subprogram, + * see bpf_subprog_info->postorder_start. + */ int *insn_postorder; int cur_stack; /* current position in the insn_postorder vector */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 921a5fa06df7..dc8d26dc9bf1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17863,7 +17863,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) static int check_cfg(struct bpf_verifier_env *env) { int insn_cnt = env->prog->len; - int *insn_stack, *insn_state, *insn_postorder; + int *insn_stack, *insn_state; int ex_insn_beg, i, ret = 0; insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); @@ -17876,14 +17876,6 @@ static int check_cfg(struct bpf_verifier_env *env) return -ENOMEM; } - insn_postorder = env->cfg.insn_postorder = - kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); - if (!insn_postorder) { - kvfree(insn_state); - kvfree(insn_stack); - return -ENOMEM; - } - ex_insn_beg = env->exception_callback_subprog ? env->subprog_info[env->exception_callback_subprog].start : 0; @@ -17901,7 +17893,6 @@ walk_cfg: case DONE_EXPLORING: insn_state[t] = EXPLORED; env->cfg.cur_stack--; - insn_postorder[env->cfg.cur_postorder++] = t; break; case KEEP_EXPLORING: break; @@ -17955,6 +17946,56 @@ err_free: return ret; } +/* + * For each subprogram 'i' fill the env->cfg.insn_postorder sub-range + * [env->subprog_info[i].postorder_start, env->subprog_info[i+1].postorder_start) + * with indices of 'i' instructions in postorder. 
+ */ +static int compute_postorder(struct bpf_verifier_env *env) +{ + u32 cur_postorder, i, top, stack_sz, s, succ_cnt, succ[2]; + int *stack = NULL, *postorder = NULL, *state = NULL; + + postorder = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); + state = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); + stack = kvcalloc(env->prog->len, sizeof(int), GFP_KERNEL_ACCOUNT); + if (!postorder || !state || !stack) { + kvfree(postorder); + kvfree(state); + kvfree(stack); + return -ENOMEM; + } + cur_postorder = 0; + for (i = 0; i < env->subprog_cnt; i++) { + env->subprog_info[i].postorder_start = cur_postorder; + stack[0] = env->subprog_info[i].start; + stack_sz = 1; + do { + top = stack[stack_sz - 1]; + state[top] |= DISCOVERED; + if (state[top] & EXPLORED) { + postorder[cur_postorder++] = top; + stack_sz--; + continue; + } + succ_cnt = bpf_insn_successors(env->prog, top, succ); + for (s = 0; s < succ_cnt; ++s) { + if (!state[succ[s]]) { + stack[stack_sz++] = succ[s]; + state[succ[s]] |= DISCOVERED; + } + } + state[top] |= EXPLORED; + } while (stack_sz); + } + env->subprog_info[i].postorder_start = cur_postorder; + env->cfg.insn_postorder = postorder; + env->cfg.cur_postorder = cur_postorder; + kvfree(stack); + kvfree(state); + return 0; +} + static int check_abnormal_return(struct bpf_verifier_env *env) { int i; @@ -24422,9 +24463,6 @@ static int compute_live_registers(struct bpf_verifier_env *env) out: kvfree(state); - kvfree(env->cfg.insn_postorder); - env->cfg.insn_postorder = NULL; - env->cfg.cur_postorder = 0; return err; } @@ -24727,6 +24765,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; + ret = compute_postorder(env); + if (ret < 0) + goto skip_full_check; + ret = check_attach_btf_id(env); if (ret) goto skip_full_check; -- cgit v1.2.3 From b3698c356ad92bcdb9920655bc9df02a2a8946f9 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:39 -0700 Subject: bpf: callchain sensitive stack liveness tracking using CFG This commit adds a flow-sensitive, context-sensitive, path-insensitive data flow analysis for live stack slots: - flow-sensitive: uses program control flow graph to compute data flow values; - context-sensitive: collects data flow values for each possible call chain in a program; - path-insensitive: does not distinguish between separate control flow graph paths reaching the same instruction. Compared to the current path-sensitive analysis, this approach trades some precision for not having to enumerate every path in the program. This gives a theoretical capability to run the analysis before main verification pass. See cover letter for motivation. The basic idea is as follows: - Data flow values indicate stack slots that might be read and stack slots that are definitely written. - Data flow values are collected for each (call chain, instruction number) combination in the program. - Within a subprogram, data flow values are propagated using control flow graph. - Data flow values are transferred from entry instructions of callee subprograms to call sites in caller subprograms. In other words, a tree of all possible call chains is constructed. Each node of this tree represents a subprogram. Read and write marks are collected for each instruction of each node. Live stack slots are first computed for lower level nodes. 
Then, information about outer stack slots that might be read or are definitely written by a subprogram is propagated one level up, to the corresponding call instructions of the upper nodes. Procedure repeats until root node is processed. In the absence of value range analysis, stack read/write marks are collected during main verification pass, and data flow computation is triggered each time verifier.c:states_equal() needs to query the information. Implementation details are documented in kernel/bpf/liveness.c. Quantitative data about verification performance changes and memory consumption is in the cover letter. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-6-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 14 + kernel/bpf/Makefile | 2 +- kernel/bpf/liveness.c | 677 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 692 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/liveness.c (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index bd87e80f9423..2e3bdd50e2ba 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -745,6 +745,8 @@ struct bpf_scc_info { struct bpf_scc_visit visits[]; }; +struct bpf_liveness; + /* single container for all structs * one verifier_env per bpf_check() call */ @@ -846,6 +848,7 @@ struct bpf_verifier_env { struct bpf_insn insn_buf[INSN_BUF_SIZE]; struct bpf_insn epilogue_buf[INSN_BUF_SIZE]; struct bpf_scc_callchain callchain_buf; + struct bpf_liveness *liveness; /* array of pointers to bpf_scc_info indexed by SCC id */ struct bpf_scc_info **scc_info; u32 scc_cnt; @@ -1074,4 +1077,15 @@ int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]); void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); +int bpf_stack_liveness_init(struct bpf_verifier_env *env); +void bpf_stack_liveness_free(struct bpf_verifier_env *env); +int bpf_update_live_stack(struct bpf_verifier_env *env); +int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frameno, u32 insn_idx, u64 mask); +void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frameno, u64 mask); +int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx); +int bpf_commit_stack_write_marks(struct bpf_verifier_env *env); +int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st); +bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi); +void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env); + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f6cf8c2af5f7..7fd0badfacb1 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse endif CFLAGS_core.o += -Wno-override-init $(cflags-nogcse-yy) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o liveness.o obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c new file mode 
100644
index 000000000000..6f9dfaaf6e64
--- /dev/null
+++ b/kernel/bpf/liveness.c
@@ -0,0 +1,677 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/hashtable.h>
+#include <linux/jhash.h>
+
+/*
+ * This file implements live stack slots analysis. After accumulating
+ * stack usage data, the analysis answers queries about whether a
+ * particular stack slot may be read by an instruction or any of its
+ * successors. This data is consumed by the verifier states caching
+ * mechanism to decide which stack slots are important when looking for a
+ * visited state corresponding to the current state.
+ *
+ * The analysis is call chain sensitive, meaning that data is collected
+ * and queried for tuples (call chain, subprogram instruction index).
+ * Such sensitivity allows identifying if some subprogram call always
+ * leads to writes in the caller's stack.
+ *
+ * The basic idea is as follows:
+ * - As the verifier accumulates a set of visited states, the analysis instance
+ *   accumulates a conservative estimate of stack slots that can be read
+ *   or must be written for each visited tuple (call chain, instruction index).
+ * - If several states happen to visit the same instruction with the same
+ *   call chain, stack usage information for the corresponding tuple is joined:
+ *   - "may_read" set represents a union of all possibly read slots
+ *     (any slot in "may_read" set might be read at or after the instruction);
+ *   - "must_write" set represents an intersection of all possibly written slots
+ *     (any slot in "must_write" set is guaranteed to be written by the instruction).
+ * - The analysis is split into two phases:
+ *   - read and write marks accumulation;
+ *   - read and write marks propagation.
+ * - The propagation phase is a textbook live variable data flow analysis:
+ *
+ *   state[cc, i].live_after = U [state[cc, s].live_before for s in insn_successors(i)]
+ *   state[cc, i].live_before =
+ *     (state[cc, i].live_after / state[cc, i].must_write) U state[cc, i].may_read
+ *
+ *   Where:
+ *   - `U` stands for set union;
+ *   - `/` stands for set difference;
+ *   - `cc` stands for a call chain;
+ *   - `i` and `s` are instruction indexes.
+ *
+ *   The above equations are computed for each call chain and instruction
+ *   index until state stops changing.
+ * - Additionally, in order to transfer "must_write" information from a
+ *   subprogram to call instructions invoking this subprogram,
+ *   the "must_write_acc" set is tracked for each (cc, i) tuple:
+ *   a set of stack slots that are guaranteed to be written by this
+ *   instruction or any of its successors (within the subprogram).
+ *   The equation for "must_write_acc" propagation looks as follows:
+ *
+ *   state[cc, i].must_write_acc =
+ *     ∩ [state[cc, s].must_write_acc for s in insn_successors(i)]
+ *     U state[cc, i].must_write
+ *
+ *   (An intersection of all "must_write_acc" for instruction successors
+ *   plus all "must_write" slots for the instruction itself).
+ * - After the propagation phase completes for a subprogram, information from
+ *   the (cc, 0) tuple (subprogram entry) is transferred to the caller's call chain:
+ *   - "must_write_acc" set is intersected with the call site's "must_write" set;
+ *   - "may_read" set is added to the call site's "may_read" set.
+ * - Any live stack queries must be taken after the propagation phase.
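+ * - As a worked example of the propagation equations above (hypothetical
+ *   masks for a single instruction i with one successor s):
+ *   if must_write(i) = {fp-8}, may_read(i) = {fp-16} and
+ *   live_before(s) = {fp-8, fp-24}, then
+ *   live_before(i) = ({fp-8, fp-24} / {fp-8}) U {fp-16} = {fp-16, fp-24},
+ *   i.e. the write screens fp-8 off while the read makes fp-16 live.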
+ * - Accumulation and propagation phases can be entered multiple times, + * at any point in time: + * - "may_read" set only grows; + * - "must_write" set only shrinks; + * - for each visited verifier state with zero branches, all relevant + * read and write marks are already recorded by the analysis instance. + * + * Technically, the analysis is facilitated by the following data structures: + * - Call chain: for given verifier state, the call chain is a tuple of call + * instruction indexes leading to the current subprogram plus the subprogram + * entry point index. + * - Function instance: for a given call chain, for each instruction in + * the current subprogram, a mapping between instruction index and a + * set of "may_read", "must_write" and other marks accumulated for this + * instruction. + * - A hash table mapping call chains to function instances. + */ + +struct callchain { + u32 callsites[MAX_CALL_FRAMES]; /* instruction pointer for each frame */ + /* cached subprog_info[*].start for functions owning the frames: + * - sp_starts[curframe] used to get insn relative index within current function; + * - sp_starts[0..current-1] used for fast callchain_frame_up(). + */ + u32 sp_starts[MAX_CALL_FRAMES]; + u32 curframe; /* depth of callsites and sp_starts arrays */ +}; + +struct per_frame_masks { + u64 may_read; /* stack slots that may be read by this instruction */ + u64 must_write; /* stack slots written by this instruction */ + u64 must_write_acc; /* stack slots written by this instruction and its successors */ + u64 live_before; /* stack slots that may be read by this insn and its successors */ +}; + +/* + * A function instance created for a specific callchain. + * Encapsulates read and write marks for each instruction in the function. + * Marks are tracked for each frame in the callchain. + */ +struct func_instance { + struct hlist_node hl_node; + struct callchain callchain; + u32 insn_cnt; /* cached number of insns in the function */ + bool updated; + bool must_write_dropped; + /* Per frame, per instruction masks, frames allocated lazily. */ + struct per_frame_masks *frames[MAX_CALL_FRAMES]; + /* For each instruction a flag telling if "must_write" had been initialized for it. */ + bool *must_write_set; +}; + +struct live_stack_query { + struct func_instance *instances[MAX_CALL_FRAMES]; /* valid in range [0..curframe] */ + u32 curframe; + u32 insn_idx; +}; + +struct bpf_liveness { + DECLARE_HASHTABLE(func_instances, 8); /* maps callchain to func_instance */ + struct live_stack_query live_stack_query; /* cache to avoid repetitive ht lookups */ + /* Cached instance corresponding to env->cur_state, avoids per-instruction ht lookup */ + struct func_instance *cur_instance; + /* + * Below fields are used to accumulate stack write marks for instruction at + * @write_insn_idx before submitting the marks to @cur_instance. 
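+ * For example, a store to the current frame's slot fp-8 first sets the
+ * corresponding bit in @write_masks_acc via bpf_mark_stack_write();
+ * the accumulated bits reach @cur_instance only once
+ * bpf_commit_stack_write_marks() is called.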
+ */ + u64 write_masks_acc[MAX_CALL_FRAMES]; + u32 write_insn_idx; +}; + +/* Compute callchain corresponding to state @st at depth @frameno */ +static void compute_callchain(struct bpf_verifier_env *env, struct bpf_verifier_state *st, + struct callchain *callchain, u32 frameno) +{ + struct bpf_subprog_info *subprog_info = env->subprog_info; + u32 i; + + memset(callchain, 0, sizeof(*callchain)); + for (i = 0; i <= frameno; i++) { + callchain->sp_starts[i] = subprog_info[st->frame[i]->subprogno].start; + if (i < st->curframe) + callchain->callsites[i] = st->frame[i + 1]->callsite; + } + callchain->curframe = frameno; + callchain->callsites[callchain->curframe] = callchain->sp_starts[callchain->curframe]; +} + +static u32 hash_callchain(struct callchain *callchain) +{ + return jhash2(callchain->callsites, callchain->curframe, 0); +} + +static bool same_callsites(struct callchain *a, struct callchain *b) +{ + int i; + + if (a->curframe != b->curframe) + return false; + for (i = a->curframe; i >= 0; i--) + if (a->callsites[i] != b->callsites[i]) + return false; + return true; +} + +/* + * Find existing or allocate new function instance corresponding to @callchain. + * Instances are accumulated in env->liveness->func_instances and persist + * until the end of the verification process. + */ +static struct func_instance *__lookup_instance(struct bpf_verifier_env *env, + struct callchain *callchain) +{ + struct bpf_liveness *liveness = env->liveness; + struct bpf_subprog_info *subprog; + struct func_instance *result; + u32 subprog_sz, size, key; + + key = hash_callchain(callchain); + hash_for_each_possible(liveness->func_instances, result, hl_node, key) + if (same_callsites(&result->callchain, callchain)) + return result; + + subprog = bpf_find_containing_subprog(env, callchain->sp_starts[callchain->curframe]); + subprog_sz = (subprog + 1)->start - subprog->start; + size = sizeof(struct func_instance); + result = kvzalloc(size, GFP_KERNEL_ACCOUNT); + if (!result) + return ERR_PTR(-ENOMEM); + result->must_write_set = kvcalloc(subprog_sz, sizeof(*result->must_write_set), + GFP_KERNEL_ACCOUNT); + if (!result->must_write_set) + return ERR_PTR(-ENOMEM); + memcpy(&result->callchain, callchain, sizeof(*callchain)); + result->insn_cnt = subprog_sz; + hash_add(liveness->func_instances, &result->hl_node, key); + return result; +} + +static struct func_instance *lookup_instance(struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + u32 frameno) +{ + struct callchain callchain; + + compute_callchain(env, st, &callchain, frameno); + return __lookup_instance(env, &callchain); +} + +int bpf_stack_liveness_init(struct bpf_verifier_env *env) +{ + env->liveness = kvzalloc(sizeof(*env->liveness), GFP_KERNEL_ACCOUNT); + if (!env->liveness) + return -ENOMEM; + hash_init(env->liveness->func_instances); + return 0; +} + +void bpf_stack_liveness_free(struct bpf_verifier_env *env) +{ + struct func_instance *instance; + struct hlist_node *tmp; + int bkt, i; + + if (!env->liveness) + return; + hash_for_each_safe(env->liveness->func_instances, bkt, tmp, instance, hl_node) { + for (i = 0; i <= instance->callchain.curframe; i++) + kvfree(instance->frames[i]); + kvfree(instance->must_write_set); + kvfree(instance); + } + kvfree(env->liveness); +} + +/* + * Convert absolute instruction index @insn_idx to an index relative + * to start of the function corresponding to @instance. 
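+ * E.g. if the function corresponding to @instance starts at insn 100,
+ * absolute index 107 maps to relative index 7.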
+ */ +static int relative_idx(struct func_instance *instance, u32 insn_idx) +{ + return insn_idx - instance->callchain.sp_starts[instance->callchain.curframe]; +} + +static struct per_frame_masks *get_frame_masks(struct func_instance *instance, + u32 frame, u32 insn_idx) +{ + if (!instance->frames[frame]) + return NULL; + + return &instance->frames[frame][relative_idx(instance, insn_idx)]; +} + +static struct per_frame_masks *alloc_frame_masks(struct bpf_verifier_env *env, + struct func_instance *instance, + u32 frame, u32 insn_idx) +{ + struct per_frame_masks *arr; + + if (!instance->frames[frame]) { + arr = kvcalloc(instance->insn_cnt, sizeof(*arr), GFP_KERNEL_ACCOUNT); + instance->frames[frame] = arr; + if (!arr) + return ERR_PTR(-ENOMEM); + } + return get_frame_masks(instance, frame, insn_idx); +} + +void bpf_reset_live_stack_callchain(struct bpf_verifier_env *env) +{ + env->liveness->cur_instance = NULL; +} + +/* If @env->liveness->cur_instance is null, set it to instance corresponding to @env->cur_state. */ +static int ensure_cur_instance(struct bpf_verifier_env *env) +{ + struct bpf_liveness *liveness = env->liveness; + struct func_instance *instance; + + if (liveness->cur_instance) + return 0; + + instance = lookup_instance(env, env->cur_state, env->cur_state->curframe); + if (IS_ERR(instance)) + return PTR_ERR(instance); + + liveness->cur_instance = instance; + return 0; +} + +/* Accumulate may_read masks for @frame at @insn_idx */ +static int mark_stack_read(struct bpf_verifier_env *env, + struct func_instance *instance, u32 frame, u32 insn_idx, u64 mask) +{ + struct per_frame_masks *masks; + u64 new_may_read; + + masks = alloc_frame_masks(env, instance, frame, insn_idx); + if (IS_ERR(masks)) + return PTR_ERR(masks); + new_may_read = masks->may_read | mask; + if (new_may_read != masks->may_read && + ((new_may_read | masks->live_before) != masks->live_before)) + instance->updated = true; + masks->may_read |= mask; + return 0; +} + +int bpf_mark_stack_read(struct bpf_verifier_env *env, u32 frame, u32 insn_idx, u64 mask) +{ + int err; + + err = ensure_cur_instance(env); + err = err ?: mark_stack_read(env, env->liveness->cur_instance, frame, insn_idx, mask); + return err; +} + +static void reset_stack_write_marks(struct bpf_verifier_env *env, + struct func_instance *instance, u32 insn_idx) +{ + struct bpf_liveness *liveness = env->liveness; + int i; + + liveness->write_insn_idx = insn_idx; + for (i = 0; i <= instance->callchain.curframe; i++) + liveness->write_masks_acc[i] = 0; +} + +int bpf_reset_stack_write_marks(struct bpf_verifier_env *env, u32 insn_idx) +{ + struct bpf_liveness *liveness = env->liveness; + int err; + + err = ensure_cur_instance(env); + if (err) + return err; + + reset_stack_write_marks(env, liveness->cur_instance, insn_idx); + return 0; +} + +void bpf_mark_stack_write(struct bpf_verifier_env *env, u32 frame, u64 mask) +{ + env->liveness->write_masks_acc[frame] |= mask; +} + +static int commit_stack_write_marks(struct bpf_verifier_env *env, + struct func_instance *instance) +{ + struct bpf_liveness *liveness = env->liveness; + u32 idx, frame, curframe, old_must_write; + struct per_frame_masks *masks; + u64 mask; + + if (!instance) + return 0; + + curframe = instance->callchain.curframe; + idx = relative_idx(instance, liveness->write_insn_idx); + for (frame = 0; frame <= curframe; frame++) { + mask = liveness->write_masks_acc[frame]; + /* avoid allocating frames for zero masks */ + if (mask == 0 && !instance->must_write_set[idx]) + continue; + masks = 
alloc_frame_masks(env, instance, frame, liveness->write_insn_idx);
+		if (IS_ERR(masks))
+			return PTR_ERR(masks);
+		old_must_write = masks->must_write;
+		/*
+		 * If the instruction at this callchain is seen for the first time,
+		 * set must_write equal to @mask. Otherwise take intersection with
+		 * the previous value.
+		 */
+		if (instance->must_write_set[idx])
+			mask &= old_must_write;
+		if (old_must_write != mask) {
+			masks->must_write = mask;
+			instance->updated = true;
+		}
+		if (old_must_write & ~mask)
+			instance->must_write_dropped = true;
+	}
+	instance->must_write_set[idx] = true;
+	liveness->write_insn_idx = 0;
+	return 0;
+}
+
+/*
+ * Merge stack write marks in @env->liveness->write_masks_acc
+ * with information already in @env->liveness->cur_instance.
+ */
+int bpf_commit_stack_write_marks(struct bpf_verifier_env *env)
+{
+	return commit_stack_write_marks(env, env->liveness->cur_instance);
+}
+
+static char *fmt_callchain(struct bpf_verifier_env *env, struct callchain *callchain)
+{
+	char *buf_end = env->tmp_str_buf + sizeof(env->tmp_str_buf);
+	char *buf = env->tmp_str_buf;
+	int i;
+
+	buf += snprintf(buf, buf_end - buf, "(");
+	for (i = 0; i <= callchain->curframe; i++)
+		buf += snprintf(buf, buf_end - buf, "%s%d", i ? "," : "", callchain->callsites[i]);
+	snprintf(buf, buf_end - buf, ")");
+	return env->tmp_str_buf;
+}
+
+static void log_mask_change(struct bpf_verifier_env *env, struct callchain *callchain,
+			    char *pfx, u32 frame, u32 insn_idx, u64 old, u64 new)
+{
+	u64 changed_bits = old ^ new;
+	u64 new_ones = new & changed_bits;
+	u64 new_zeros = ~new & changed_bits;
+
+	if (!changed_bits)
+		return;
+	bpf_log(&env->log, "%s frame %d insn %d ", fmt_callchain(env, callchain), frame, insn_idx);
+	if (new_ones) {
+		bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_ones);
+		bpf_log(&env->log, "+%s %s ", pfx, env->tmp_str_buf);
+	}
+	if (new_zeros) {
+		bpf_fmt_stack_mask(env->tmp_str_buf, sizeof(env->tmp_str_buf), new_zeros);
+		bpf_log(&env->log, "-%s %s", pfx, env->tmp_str_buf);
+	}
+	bpf_log(&env->log, "\n");
+}
+
+static struct func_instance *get_outer_instance(struct bpf_verifier_env *env,
+						struct func_instance *instance)
+{
+	struct callchain callchain = instance->callchain;
+
+	/* Adjust @callchain to represent the callchain one frame up */
+	callchain.callsites[callchain.curframe] = 0;
+	callchain.sp_starts[callchain.curframe] = 0;
+	callchain.curframe--;
+	callchain.callsites[callchain.curframe] = callchain.sp_starts[callchain.curframe];
+	return __lookup_instance(env, &callchain);
+}
+
+static u32 callchain_subprog_start(struct callchain *callchain)
+{
+	return callchain->sp_starts[callchain->curframe];
+}
+
+/*
+ * Transfer @may_read and @must_write_acc marks from the first instruction of @instance
+ * to the call instruction in the function instance calling @instance.
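+ * E.g. for a hypothetical callchain (0,5), where insn 5 of the main
+ * subprogram is the call: "live_before" of the callee's entry instruction
+ * is added to the "may_read" marks of insn 5 in instance (0), and the
+ * entry's "must_write_acc" is committed as stack writes of insn 5.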
+ */ +static int propagate_to_outer_instance(struct bpf_verifier_env *env, + struct func_instance *instance) +{ + struct callchain *callchain = &instance->callchain; + u32 this_subprog_start, callsite, frame; + struct func_instance *outer_instance; + struct per_frame_masks *insn; + int err; + + this_subprog_start = callchain_subprog_start(callchain); + outer_instance = get_outer_instance(env, instance); + callsite = callchain->callsites[callchain->curframe - 1]; + + reset_stack_write_marks(env, outer_instance, callsite); + for (frame = 0; frame < callchain->curframe; frame++) { + insn = get_frame_masks(instance, frame, this_subprog_start); + if (!insn) + continue; + bpf_mark_stack_write(env, frame, insn->must_write_acc); + err = mark_stack_read(env, outer_instance, frame, callsite, insn->live_before); + if (err) + return err; + } + commit_stack_write_marks(env, outer_instance); + return 0; +} + +static inline bool update_insn(struct bpf_verifier_env *env, + struct func_instance *instance, u32 frame, u32 insn_idx) +{ + struct bpf_insn_aux_data *aux = env->insn_aux_data; + u64 new_before, new_after, must_write_acc; + struct per_frame_masks *insn, *succ_insn; + u32 succ_num, s, succ[2]; + bool changed; + + succ_num = bpf_insn_successors(env->prog, insn_idx, succ); + if (unlikely(succ_num == 0)) + return false; + + changed = false; + insn = get_frame_masks(instance, frame, insn_idx); + new_before = 0; + new_after = 0; + /* + * New "must_write_acc" is an intersection of all "must_write_acc" + * of successors plus all "must_write" slots of instruction itself. + */ + must_write_acc = U64_MAX; + for (s = 0; s < succ_num; ++s) { + succ_insn = get_frame_masks(instance, frame, succ[s]); + new_after |= succ_insn->live_before; + must_write_acc &= succ_insn->must_write_acc; + } + must_write_acc |= insn->must_write; + /* + * New "live_before" is a union of all "live_before" of successors + * minus slots written by instruction plus slots read by instruction. + */ + new_before = (new_after & ~insn->must_write) | insn->may_read; + changed |= new_before != insn->live_before; + changed |= must_write_acc != insn->must_write_acc; + if (unlikely(env->log.level & BPF_LOG_LEVEL2) && + (insn->may_read || insn->must_write || + insn_idx == callchain_subprog_start(&instance->callchain) || + aux[insn_idx].prune_point)) { + log_mask_change(env, &instance->callchain, "live", + frame, insn_idx, insn->live_before, new_before); + log_mask_change(env, &instance->callchain, "written", + frame, insn_idx, insn->must_write_acc, must_write_acc); + } + insn->live_before = new_before; + insn->must_write_acc = must_write_acc; + return changed; +} + +/* Fixed-point computation of @live_before and @must_write_acc marks */ +static int update_instance(struct bpf_verifier_env *env, struct func_instance *instance) +{ + u32 i, frame, po_start, po_end, cnt, this_subprog_start; + struct callchain *callchain = &instance->callchain; + int *insn_postorder = env->cfg.insn_postorder; + struct bpf_subprog_info *subprog; + struct per_frame_masks *insn; + bool changed; + int err; + + this_subprog_start = callchain_subprog_start(callchain); + /* + * If must_write marks were updated must_write_acc needs to be reset + * (to account for the case when new must_write sets became smaller). 
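+ * Resetting restarts the fixed point computation from below: recomputing
+ * from stale (too large) "must_write_acc" values could converge on a
+ * solution that still contains bits no longer guaranteed to be written.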
+ */ + if (instance->must_write_dropped) { + for (frame = 0; frame <= callchain->curframe; frame++) { + if (!instance->frames[frame]) + continue; + + for (i = 0; i < instance->insn_cnt; i++) { + insn = get_frame_masks(instance, frame, this_subprog_start + i); + insn->must_write_acc = 0; + } + } + } + + subprog = bpf_find_containing_subprog(env, this_subprog_start); + po_start = subprog->postorder_start; + po_end = (subprog + 1)->postorder_start; + cnt = 0; + /* repeat until fixed point is reached */ + do { + cnt++; + changed = false; + for (frame = 0; frame <= instance->callchain.curframe; frame++) { + if (!instance->frames[frame]) + continue; + + for (i = po_start; i < po_end; i++) + changed |= update_insn(env, instance, frame, insn_postorder[i]); + } + } while (changed); + + if (env->log.level & BPF_LOG_LEVEL2) + bpf_log(&env->log, "%s live stack update done in %d iterations\n", + fmt_callchain(env, callchain), cnt); + + /* transfer marks accumulated for outer frames to outer func instance (caller) */ + if (callchain->curframe > 0) { + err = propagate_to_outer_instance(env, instance); + if (err) + return err; + } + + return 0; +} + +/* + * Prepare all callchains within @env->cur_state for querying. + * This function should be called after each verifier.c:pop_stack() + * and whenever verifier.c:do_check_insn() processes subprogram exit. + * This would guarantee that visited verifier states with zero branches + * have their bpf_mark_stack_{read,write}() effects propagated in + * @env->liveness. + */ +int bpf_update_live_stack(struct bpf_verifier_env *env) +{ + struct func_instance *instance; + int err, frame; + + bpf_reset_live_stack_callchain(env); + for (frame = env->cur_state->curframe; frame >= 0; --frame) { + instance = lookup_instance(env, env->cur_state, frame); + if (IS_ERR(instance)) + return PTR_ERR(instance); + + if (instance->updated) { + err = update_instance(env, instance); + if (err) + return err; + instance->updated = false; + instance->must_write_dropped = false; + } + } + return 0; +} + +static bool is_live_before(struct func_instance *instance, u32 insn_idx, u32 frameno, u32 spi) +{ + struct per_frame_masks *masks; + + masks = get_frame_masks(instance, frameno, insn_idx); + return masks && (masks->live_before & BIT(spi)); +} + +int bpf_live_stack_query_init(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct live_stack_query *q = &env->liveness->live_stack_query; + struct func_instance *instance; + u32 frame; + + memset(q, 0, sizeof(*q)); + for (frame = 0; frame <= st->curframe; frame++) { + instance = lookup_instance(env, st, frame); + if (IS_ERR(instance)) + return PTR_ERR(instance); + q->instances[frame] = instance; + } + q->curframe = st->curframe; + q->insn_idx = st->insn_idx; + return 0; +} + +bool bpf_stack_slot_alive(struct bpf_verifier_env *env, u32 frameno, u32 spi) +{ + /* + * Slot is alive if it is read before q->st->insn_idx in current func instance, + * or if for some outer func instance: + * - alive before callsite if callsite calls callback, otherwise + * - alive after callsite + */ + struct live_stack_query *q = &env->liveness->live_stack_query; + struct func_instance *instance, *curframe_instance; + u32 i, callsite; + bool alive; + + curframe_instance = q->instances[q->curframe]; + if (is_live_before(curframe_instance, q->insn_idx, frameno, spi)) + return true; + + for (i = frameno; i < q->curframe; i++) { + callsite = curframe_instance->callchain.callsites[i]; + instance = q->instances[i]; + alive = bpf_calls_callback(env, 
callsite) + ? is_live_before(instance, callsite, frameno, spi) + : is_live_before(instance, callsite + 1, frameno, spi); + if (alive) + return true; + } + + return false; +} -- cgit v1.2.3 From ccf25a67c7e29cfa6815d193054789b45ef825ad Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:41 -0700 Subject: bpf: signal error if old liveness is more conservative than new Unlike the new algorithm, register chain based liveness tracking is fully path sensitive, and thus should be strictly more accurate. Validate the new algorithm by signaling an error whenever it considers a stack slot dead while the old algorithm considers it alive. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-8-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 9 +++++++++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 2e3bdd50e2ba..dec5da3a2e59 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -852,6 +852,7 @@ struct bpf_verifier_env { /* array of pointers to bpf_scc_info indexed by SCC id */ struct bpf_scc_info **scc_info; u32 scc_cnt; + bool internal_error; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bb931a144b95..f70e34a38c13 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18576,6 +18576,11 @@ static void clean_func_state(struct bpf_verifier_env *env, for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { if (!bpf_stack_slot_alive(env, st->frameno, i)) { + if (st->stack[i].spilled_ptr.live & REG_LIVE_READ) { + verifier_bug(env, "incorrect live marks #1 for insn %d frameno %d spi %d\n", + env->insn_idx, st->frameno, i); + env->internal_error = true; + } __mark_reg_not_init(env, &st->stack[i].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) st->stack[i].slot_type[j] = STACK_INVALID; @@ -19546,6 +19551,8 @@ skip_inf_loop_check: loop = incomplete_read_marks(env, &sl->state); if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) { hit: + if (env->internal_error) + return -EFAULT; sl->hit_cnt++; /* reached equivalent register/stack state, * prune the search. @@ -19660,6 +19667,8 @@ hit: return 1; } miss: + if (env->internal_error) + return -EFAULT; /* when new state is not going to be added do not increase miss count. * Otherwise several loop iterations will remove the state * recorded earlier. The goal of these heuristics is to have -- cgit v1.2.3 From 107e169799057bc6a379ddb625cbe1e51cfc7d72 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:42 -0700 Subject: bpf: disable and remove registers chain based liveness Remove register chain based liveness tracking: - struct bpf_reg_state->{parent,live} fields are no longer needed; - REG_LIVE_WRITTEN marks are superseded by bpf_mark_stack_write() calls; - mark_reg_read() calls are superseded by bpf_mark_stack_read(); - log.c:print_liveness() is superseded by logging in liveness.c; - propagate_liveness() is superseded by bpf_update_live_stack(); - no need to establish register chains in is_state_visited() anymore; - fix a bunch of tests expecting "_w" suffixes in verifier log messages. 
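As a rough sketch of the replacement (illustrative only, using the API added earlier in this series; frameno and spi stand for the frame and stack slot being accessed), stack accesses are now reported to the liveness analysis instead of updating reg->live:

	/* on a stack write at insn_idx */
	bpf_reset_stack_write_marks(env, insn_idx);
	bpf_mark_stack_write(env, frameno, BIT(spi));
	bpf_commit_stack_write_marks(env);

	/* on a stack read at insn_idx */
	bpf_mark_stack_read(env, frameno, insn_idx, BIT(spi));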
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-9-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/verifier.rst | 264 ----------------- include/linux/bpf_verifier.h | 25 -- kernel/bpf/log.c | 26 +- kernel/bpf/verifier.c | 315 ++------------------- tools/testing/selftests/bpf/prog_tests/align.c | 178 ++++++------ tools/testing/selftests/bpf/prog_tests/spin_lock.c | 12 +- .../selftests/bpf/prog_tests/test_veristat.c | 44 +-- .../selftests/bpf/progs/exceptions_assert.c | 34 +-- .../selftests/bpf/progs/iters_state_safety.c | 4 +- .../selftests/bpf/progs/iters_testmod_seq.c | 6 +- .../selftests/bpf/progs/mem_rdonly_untrusted.c | 4 +- .../testing/selftests/bpf/progs/verifier_bounds.c | 38 +-- .../selftests/bpf/progs/verifier_global_ptr_args.c | 4 +- tools/testing/selftests/bpf/progs/verifier_ldsx.c | 2 +- .../selftests/bpf/progs/verifier_precision.c | 16 +- .../selftests/bpf/progs/verifier_scalar_ids.c | 10 +- .../selftests/bpf/progs/verifier_spill_fill.c | 40 +-- .../bpf/progs/verifier_subprog_precision.c | 6 +- tools/testing/selftests/bpf/verifier/bpf_st_mem.c | 4 +- 19 files changed, 226 insertions(+), 806 deletions(-) (limited to 'include/linux') diff --git a/Documentation/bpf/verifier.rst b/Documentation/bpf/verifier.rst index 95e6f80a407e..510d15bc697b 100644 --- a/Documentation/bpf/verifier.rst +++ b/Documentation/bpf/verifier.rst @@ -347,270 +347,6 @@ However, only the value of register ``r1`` is important to successfully finish verification. The goal of the liveness tracking algorithm is to spot this fact and figure out that both states are actually equivalent. -Data structures -~~~~~~~~~~~~~~~ - -Liveness is tracked using the following data structures:: - - enum bpf_reg_liveness { - REG_LIVE_NONE = 0, - REG_LIVE_READ32 = 0x1, - REG_LIVE_READ64 = 0x2, - REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, - REG_LIVE_WRITTEN = 0x4, - REG_LIVE_DONE = 0x8, - }; - - struct bpf_reg_state { - ... - struct bpf_reg_state *parent; - ... - enum bpf_reg_liveness live; - ... - }; - - struct bpf_stack_state { - struct bpf_reg_state spilled_ptr; - ... - }; - - struct bpf_func_state { - struct bpf_reg_state regs[MAX_BPF_REG]; - ... - struct bpf_stack_state *stack; - } - - struct bpf_verifier_state { - struct bpf_func_state *frame[MAX_CALL_FRAMES]; - struct bpf_verifier_state *parent; - ... - } - -* ``REG_LIVE_NONE`` is an initial value assigned to ``->live`` fields upon new - verifier state creation; - -* ``REG_LIVE_WRITTEN`` means that the value of the register (or stack slot) is - defined by some instruction verified between this verifier state's parent and - verifier state itself; - -* ``REG_LIVE_READ{32,64}`` means that the value of the register (or stack slot) - is read by a some child state of this verifier state; - -* ``REG_LIVE_DONE`` is a marker used by ``clean_verifier_state()`` to avoid - processing same verifier state multiple times and for some sanity checks; - -* ``->live`` field values are formed by combining ``enum bpf_reg_liveness`` - values using bitwise or. - -Register parentage chains -~~~~~~~~~~~~~~~~~~~~~~~~~ - -In order to propagate information between parent and child states, a *register -parentage chain* is established. Each register or stack slot is linked to a -corresponding register or stack slot in its parent state via a ``->parent`` -pointer. 
This link is established upon state creation in ``is_state_visited()`` -and might be modified by ``set_callee_state()`` called from -``__check_func_call()``. - -The rules for correspondence between registers / stack slots are as follows: - -* For the current stack frame, registers and stack slots of the new state are - linked to the registers and stack slots of the parent state with the same - indices. - -* For the outer stack frames, only callee saved registers (r6-r9) and stack - slots are linked to the registers and stack slots of the parent state with the - same indices. - -* When function call is processed a new ``struct bpf_func_state`` instance is - allocated, it encapsulates a new set of registers and stack slots. For this - new frame, parent links for r6-r9 and stack slots are set to nil, parent links - for r1-r5 are set to match caller r1-r5 parent links. - -This could be illustrated by the following diagram (arrows stand for -``->parent`` pointers):: - - ... ; Frame #0, some instructions - --- checkpoint #0 --- - 1 : r6 = 42 ; Frame #0 - --- checkpoint #1 --- - 2 : call foo() ; Frame #0 - ... ; Frame #1, instructions from foo() - --- checkpoint #2 --- - ... ; Frame #1, instructions from foo() - --- checkpoint #3 --- - exit ; Frame #1, return from foo() - 3 : r1 = r6 ; Frame #0 <- current state - - +-------------------------------+-------------------------------+ - | Frame #0 | Frame #1 | - Checkpoint +-------------------------------+-------------------------------+ - #0 | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+ - ^ ^ ^ ^ - | | | | - Checkpoint +-------------------------------+ - #1 | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+ - ^ ^ ^ - |_______|_______|_______________ - | | | - nil nil | | | nil nil - | | | | | | | - Checkpoint +-------------------------------+-------------------------------+ - #2 | r0 | r1-r5 | r6-r9 | fp-8 ... | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+-------------------------------+ - ^ ^ ^ ^ ^ - nil nil | | | | | - | | | | | | | - Checkpoint +-------------------------------+-------------------------------+ - #3 | r0 | r1-r5 | r6-r9 | fp-8 ... | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+-------------------------------+ - ^ ^ - nil nil | | - | | | | - Current +-------------------------------+ - state | r0 | r1-r5 | r6-r9 | fp-8 ... | - +-------------------------------+ - \ - r6 read mark is propagated via these links - all the way up to checkpoint #1. - The checkpoint #1 contains a write mark for r6 - because of instruction (1), thus read propagation - does not reach checkpoint #0 (see section below). - -Liveness marks tracking -~~~~~~~~~~~~~~~~~~~~~~~ - -For each processed instruction, the verifier tracks read and written registers -and stack slots. The main idea of the algorithm is that read marks propagate -back along the state parentage chain until they hit a write mark, which 'screens -off' earlier states from the read. The information about reads is propagated by -function ``mark_reg_read()`` which could be summarized as follows:: - - mark_reg_read(struct bpf_reg_state *state, ...): - parent = state->parent - while parent: - if state->live & REG_LIVE_WRITTEN: - break - if parent->live & REG_LIVE_READ64: - break - parent->live |= REG_LIVE_READ64 - state = parent - parent = state->parent - -Notes: - -* The read marks are applied to the **parent** state while write marks are - applied to the **current** state. 
The write mark on a register or stack slot - means that it is updated by some instruction in the straight-line code leading - from the parent state to the current state. - -* Details about REG_LIVE_READ32 are omitted. - -* Function ``propagate_liveness()`` (see section :ref:`read_marks_for_cache_hits`) - might override the first parent link. Please refer to the comments in the - ``propagate_liveness()`` and ``mark_reg_read()`` source code for further - details. - -Because stack writes could have different sizes ``REG_LIVE_WRITTEN`` marks are -applied conservatively: stack slots are marked as written only if write size -corresponds to the size of the register, e.g. see function ``save_register_state()``. - -Consider the following example:: - - 0: (*u64)(r10 - 8) = 0 ; define 8 bytes of fp-8 - --- checkpoint #0 --- - 1: (*u32)(r10 - 8) = 1 ; redefine lower 4 bytes - 2: r1 = (*u32)(r10 - 8) ; read lower 4 bytes defined at (1) - 3: r2 = (*u32)(r10 - 4) ; read upper 4 bytes defined at (0) - -As stated above, the write at (1) does not count as ``REG_LIVE_WRITTEN``. Should -it be otherwise, the algorithm above wouldn't be able to propagate the read mark -from (3) to checkpoint #0. - -Once the ``BPF_EXIT`` instruction is reached ``update_branch_counts()`` is -called to update the ``->branches`` counter for each verifier state in a chain -of parent verifier states. When the ``->branches`` counter reaches zero the -verifier state becomes a valid entry in a set of cached verifier states. - -Each entry of the verifier states cache is post-processed by a function -``clean_live_states()``. This function marks all registers and stack slots -without ``REG_LIVE_READ{32,64}`` marks as ``NOT_INIT`` or ``STACK_INVALID``. -Registers/stack slots marked in this way are ignored in function ``stacksafe()`` -called from ``states_equal()`` when a state cache entry is considered for -equivalence with a current state. - -Now it is possible to explain how the example from the beginning of the section -works:: - - 0: call bpf_get_prandom_u32() - 1: r1 = 0 - 2: if r0 == 0 goto +1 - 3: r0 = 1 - --- checkpoint[0] --- - 4: r0 = r1 - 5: exit - -* At instruction #2 branching point is reached and state ``{ r0 == 0, r1 == 0, pc == 4 }`` - is pushed to states processing queue (pc stands for program counter). - -* At instruction #4: - - * ``checkpoint[0]`` states cache entry is created: ``{ r0 == 1, r1 == 0, pc == 4 }``; - * ``checkpoint[0].r0`` is marked as written; - * ``checkpoint[0].r1`` is marked as read; - -* At instruction #5 exit is reached and ``checkpoint[0]`` can now be processed - by ``clean_live_states()``. After this processing ``checkpoint[0].r1`` has a - read mark and all other registers and stack slots are marked as ``NOT_INIT`` - or ``STACK_INVALID`` - -* The state ``{ r0 == 0, r1 == 0, pc == 4 }`` is popped from the states queue - and is compared against a cached state ``{ r1 == 0, pc == 4 }``, the states - are considered equivalent. - -.. _read_marks_for_cache_hits: - -Read marks propagation for cache hits -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Another point is the handling of read marks when a previously verified state is -found in the states cache. Upon cache hit verifier must behave in the same way -as if the current state was verified to the program exit. This means that all -read marks, present on registers and stack slots of the cached state, must be -propagated over the parentage chain of the current state. Example below shows -why this is important. 
Function ``propagate_liveness()`` handles this case. - -Consider the following state parentage chain (S is a starting state, A-E are -derived states, -> arrows show which state is derived from which):: - - r1 read - <------------- A[r1] == 0 - C[r1] == 0 - S ---> A ---> B ---> exit E[r1] == 1 - | - ` ---> C ---> D - | - ` ---> E ^ - |___ suppose all these - ^ states are at insn #Y - | - suppose all these - states are at insn #X - -* Chain of states ``S -> A -> B -> exit`` is verified first. - -* While ``B -> exit`` is verified, register ``r1`` is read and this read mark is - propagated up to state ``A``. - -* When chain of states ``C -> D`` is verified the state ``D`` turns out to be - equivalent to state ``B``. - -* The read mark for ``r1`` has to be propagated to state ``C``, otherwise state - ``C`` might get mistakenly marked as equivalent to state ``E`` even though - values for register ``r1`` differ between ``C`` and ``E``. - Understanding eBPF verifier messages ==================================== diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index dec5da3a2e59..c7515da8500c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -26,27 +26,6 @@ /* Patch buffer size */ #define INSN_BUF_SIZE 32 -/* Liveness marks, used for registers and spilled-regs (in stack slots). - * Read marks propagate upwards until they find a write mark; they record that - * "one of this state's descendants read this reg" (and therefore the reg is - * relevant for states_equal() checks). - * Write marks collect downwards and do not propagate; they record that "the - * straight-line code that reached this state (from its parent) wrote this reg" - * (and therefore that reads propagated from this state or its descendants - * should not propagate to its parent). - * A state with a write mark can receive read marks; it just won't propagate - * them to its parent, since the write mark is a property, not of the state, - * but of the link between it and its parent. See mark_reg_read() and - * mark_stack_slot_read() in kernel/bpf/verifier.c. - */ -enum bpf_reg_liveness { - REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ - REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */ - REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ - REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, - REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */ -}; - #define ITER_PREFIX "bpf_iter_" enum bpf_iter_state { @@ -211,8 +190,6 @@ struct bpf_reg_state { * allowed and has the same effect as bpf_sk_release(sk). */ u32 ref_obj_id; - /* parentage chain for liveness checking */ - struct bpf_reg_state *parent; /* Inside the callee two registers can be both PTR_TO_STACK like * R1=fp-8 and R2=fp-8, but one of them points to this function stack * while another to the caller's stack. To differentiate them 'frameno' @@ -225,7 +202,6 @@ struct bpf_reg_state { * patching which only happens after main verification finished. 
*/ s32 subreg_def; - enum bpf_reg_liveness live; /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */ bool precise; }; @@ -852,7 +828,6 @@ struct bpf_verifier_env { /* array of pointers to bpf_scc_info indexed by SCC id */ struct bpf_scc_info **scc_info; u32 scc_cnt; - bool internal_error; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 0d6d7bfb2fd0..f50533169cc3 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -542,17 +542,6 @@ static char slot_type_char[] = { [STACK_IRQ_FLAG] = 'f' }; -static void print_liveness(struct bpf_verifier_env *env, - enum bpf_reg_liveness live) -{ - if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) - verbose(env, "_"); - if (live & REG_LIVE_READ) - verbose(env, "r"); - if (live & REG_LIVE_WRITTEN) - verbose(env, "w"); -} - #define UNUM_MAX_DECIMAL U16_MAX #define SNUM_MAX_DECIMAL S16_MAX #define SNUM_MIN_DECIMAL S16_MIN @@ -770,7 +759,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie if (!print_all && !reg_scratched(env, i)) continue; verbose(env, " R%d", i); - print_liveness(env, reg->live); verbose(env, "="); print_reg_state(env, state, reg); } @@ -803,9 +791,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie break; types_buf[j] = '\0'; - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=%s", types_buf); + verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf); print_reg_state(env, state, reg); break; case STACK_DYNPTR: @@ -814,7 +800,6 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie reg = &state->stack[i].spilled_ptr; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type)); if (reg->id) verbose_a("id=%d", reg->id); @@ -829,9 +814,8 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie if (!reg->ref_obj_id) continue; - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)", + verbose(env, " fp%d=iter_%s(ref_id=%d,state=%s,depth=%u)", + (-i - 1) * BPF_REG_SIZE, iter_type_str(reg->iter.btf, reg->iter.btf_id), reg->ref_obj_id, iter_state_str(reg->iter.state), reg->iter.depth); @@ -839,9 +823,7 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifie case STACK_MISC: case STACK_ZERO: default: - verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); - print_liveness(env, reg->live); - verbose(env, "=%s", types_buf); + verbose(env, " fp%d=%s", (-i - 1) * BPF_REG_SIZE, types_buf); break; } } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f70e34a38c13..e1da2471442b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -787,8 +787,6 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); return 0; @@ -806,29 +804,6 @@ static void invalidate_dynptr(struct bpf_verifier_env *env, struct bpf_func_stat __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - /* Why do we need to set REG_LIVE_WRITTEN for STACK_INVALID 
slot? - * - * While we don't allow reading STACK_INVALID, it is still possible to - * do <8 byte writes marking some but not all slots as STACK_MISC. Then, - * helpers or insns can do partial read of that part without failing, - * but check_stack_range_initialized, check_stack_read_var_off, and - * check_stack_read_fixed_off will do mark_reg_read for all 8-bytes of - * the slot conservatively. Hence we need to prevent those liveness - * marking walks. - * - * This was not a problem before because STACK_INVALID is only set by - * default (where the default reg state has its reg->parent as NULL), or - * in clean_live_states after REG_LIVE_DONE (at which point - * mark_reg_read won't walk reg->parent chain), but not randomly during - * verifier state exploration (like we did above). Hence, for our case - * parentage chain will still be live (i.e. reg->parent may be - * non-NULL), while earlier reg->parent was NULL, so we need - * REG_LIVE_WRITTEN to screen off read marker propagation when it is - * done later on reads or by mark_dynptr_read as well to unnecessary - * mark registers in verifier state. - */ - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); } @@ -938,9 +913,6 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env, __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); - /* Same reason as unmark_stack_slots_dynptr above */ - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - state->stack[spi - 1].spilled_ptr.live |= REG_LIVE_WRITTEN; bpf_mark_stack_write(env, state->frameno, BIT(spi - 1) | BIT(spi)); return 0; @@ -1059,7 +1031,6 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env, else st->type |= PTR_UNTRUSTED; } - st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = i == 0 ? id : 0; st->iter.btf = btf; st->iter.btf_id = btf_id; @@ -1095,9 +1066,6 @@ static int unmark_stack_slots_iter(struct bpf_verifier_env *env, __mark_reg_not_init(env, st); - /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ - st->live |= REG_LIVE_WRITTEN; - for (j = 0; j < BPF_REG_SIZE; j++) slot->slot_type[j] = STACK_INVALID; @@ -1194,7 +1162,6 @@ static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env, bpf_mark_stack_write(env, reg->frameno, BIT(spi)); __mark_reg_known_zero(st); st->type = PTR_TO_STACK; /* we don't have dedicated reg type */ - st->live |= REG_LIVE_WRITTEN; st->ref_obj_id = id; st->irq.kfunc_class = kfunc_class; @@ -1248,8 +1215,6 @@ static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_r __mark_reg_not_init(env, st); - /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */ - st->live |= REG_LIVE_WRITTEN; bpf_mark_stack_write(env, reg->frameno, BIT(spi)); for (i = 0; i < BPF_REG_SIZE; i++) @@ -2901,8 +2866,6 @@ static void init_reg_state(struct bpf_verifier_env *env, for (i = 0; i < MAX_BPF_REG; i++) { mark_reg_not_init(env, regs, i); - regs[i].live = REG_LIVE_NONE; - regs[i].parent = NULL; regs[i].subreg_def = DEF_NOT_SUBREG; } @@ -3583,64 +3546,12 @@ next: return 0; } -/* Parentage chain of this register (or stack slot) should take care of all - * issues like callee-saved registers, stack slot allocation time, etc. 
- */ -static int mark_reg_read(struct bpf_verifier_env *env, - const struct bpf_reg_state *state, - struct bpf_reg_state *parent, u8 flag) -{ - bool writes = parent == state->parent; /* Observe write marks */ - int cnt = 0; - - while (parent) { - /* if read wasn't screened by an earlier write ... */ - if (writes && state->live & REG_LIVE_WRITTEN) - break; - /* The first condition is more likely to be true than the - * second, checked it first. - */ - if ((parent->live & REG_LIVE_READ) == flag || - parent->live & REG_LIVE_READ64) - /* The parentage chain never changes and - * this parent was already marked as LIVE_READ. - * There is no need to keep walking the chain again and - * keep re-marking all parents as LIVE_READ. - * This case happens when the same register is read - * multiple times without writes into it in-between. - * Also, if parent has the stronger REG_LIVE_READ64 set, - * then no need to set the weak REG_LIVE_READ32. - */ - break; - /* ... then we depend on parent's value */ - parent->live |= flag; - /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */ - if (flag == REG_LIVE_READ64) - parent->live &= ~REG_LIVE_READ32; - state = parent; - parent = state->parent; - writes = true; - cnt++; - } - - if (env->longest_mark_read_walk < cnt) - env->longest_mark_read_walk = cnt; - return 0; -} - static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi, int nr_slots) { - struct bpf_func_state *state = func(env, reg); int err, i; for (i = 0; i < nr_slots; i++) { - struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr; - - err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64); - if (err) - return err; - err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi - i)); if (err) return err; @@ -3852,15 +3763,13 @@ static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *r if (rw64) mark_insn_zext(env, reg); - return mark_reg_read(env, reg, reg->parent, - rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32); + return 0; } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { verbose(env, "frame pointer is read only\n"); return -EACCES; } - reg->live |= REG_LIVE_WRITTEN; reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1; if (t == DST_OP) mark_reg_unknown(env, regs, regno); @@ -5065,12 +4974,7 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, /* Copy src state preserving dst->parent and dst->live fields */ static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src) { - struct bpf_reg_state *parent = dst->parent; - enum bpf_reg_liveness live = dst->live; - *dst = *src; - dst->parent = parent; - dst->live = live; } static void save_register_state(struct bpf_verifier_env *env, @@ -5081,8 +4985,6 @@ static void save_register_state(struct bpf_verifier_env *env, int i; copy_register_state(&state->stack[spi].spilled_ptr, reg); - if (size == BPF_REG_SIZE) - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = BPF_REG_SIZE; i > BPF_REG_SIZE - size; i--) state->stack[spi].slot_type[i - 1] = STACK_SPILL; @@ -5231,17 +5133,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, for (i = 0; i < BPF_REG_SIZE; i++) scrub_spilled_slot(&state->stack[spi].slot_type[i]); - /* only mark the slot as written if all 8 bytes were written - * otherwise read propagation may incorrectly stop too soon - * when stack slots are partially written. 
- * This heuristic means that read propagation will be - * conservative, since it will add reg_live_read marks - * to stack slots all the way to first state when programs - * writes+reads less than 8 bytes - */ - if (size == BPF_REG_SIZE) - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - /* when we zero initialize stack slots mark them as such */ if ((reg && register_is_null(reg)) || (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) { @@ -5434,7 +5325,6 @@ static void mark_reg_stack_read(struct bpf_verifier_env *env, /* have read misc data from the stack */ mark_reg_unknown(env, state->regs, dst_regno); } - state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } /* Read the stack at 'off' and put the results into the register indicated by @@ -5481,7 +5371,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (dst_regno < 0) return 0; @@ -5535,7 +5424,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* not restoring original register state */ } } - state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (dst_regno >= 0) { /* restore register state from stack */ copy_register_state(&state->regs[dst_regno], reg); @@ -5543,7 +5431,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ - state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) { /* If dst_regno==-1, the caller is asking us whether * it is acceptable to use this value as a SCALAR_VALUE @@ -5555,7 +5442,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, off); return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { for (i = 0; i < size; i++) { type = stype[(slot - i) % BPF_REG_SIZE]; @@ -5569,7 +5455,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, off, i, size); return -EACCES; } - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (dst_regno >= 0) mark_reg_stack_read(env, reg_state, off, off + size, dst_regno); insn_flags = 0; /* we are not restoring spilled register */ @@ -8197,13 +8082,10 @@ mark: /* reading any byte out of 8-byte 'spill_slot' will cause * the whole slot to be marked as 'read' */ - mark_reg_read(env, &state->stack[spi].spilled_ptr, - state->stack[spi].spilled_ptr.parent, - REG_LIVE_READ64); err = bpf_mark_stack_read(env, reg->frameno, env->insn_idx, BIT(spi)); if (err) return err; - /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not + /* We do not call bpf_mark_stack_write(), as we can not * be sure that whether stack slot is written to or not. Hence, * we must still conservatively propagate reads upwards even if * helper may write to the entire memory range. @@ -11041,8 +10923,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) } /* we are going to rely on register's precise value */ - err = mark_reg_read(env, r0, r0->parent, REG_LIVE_READ64); - err = err ?: mark_chain_precision(env, BPF_REG_0); + err = mark_chain_precision(env, BPF_REG_0); if (err) return err; @@ -11946,17 +11827,11 @@ static void __mark_btf_func_reg_size(struct bpf_verifier_env *env, struct bpf_re if (regno == BPF_REG_0) { /* Function return value */ - reg->live |= REG_LIVE_WRITTEN; reg->subreg_def = reg_size == sizeof(u64) ? 
DEF_NOT_SUBREG : env->insn_idx + 1; - } else { + } else if (reg_size == sizeof(u64)) { /* Function argument */ - if (reg_size == sizeof(u64)) { - mark_insn_zext(env, reg); - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); - } else { - mark_reg_read(env, reg, reg->parent, REG_LIVE_READ32); - } + mark_insn_zext(env, reg); } } @@ -15710,7 +15585,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ assign_scalar_id_before_mov(env, src_reg); copy_register_state(dst_reg, src_reg); - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { /* case: R1 = (s8, s16 s32)R2 */ @@ -15729,7 +15603,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (!no_sext) dst_reg->id = 0; coerce_reg_to_size_sx(dst_reg, insn->off >> 3); - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = DEF_NOT_SUBREG; } else { mark_reg_unknown(env, regs, insn->dst_reg); @@ -15755,7 +15628,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ if (!is_src_reg_u32) dst_reg->id = 0; - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = env->insn_idx + 1; } else { /* case: W1 = (s8, s16)W2 */ @@ -15766,7 +15638,6 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) copy_register_state(dst_reg, src_reg); if (!no_sext) dst_reg->id = 0; - dst_reg->live |= REG_LIVE_WRITTEN; dst_reg->subreg_def = env->insn_idx + 1; coerce_subreg_to_size_sx(dst_reg, insn->off >> 3); } @@ -18576,11 +18447,6 @@ static void clean_func_state(struct bpf_verifier_env *env, for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { if (!bpf_stack_slot_alive(env, st->frameno, i)) { - if (st->stack[i].spilled_ptr.live & REG_LIVE_READ) { - verifier_bug(env, "incorrect live marks #1 for insn %d frameno %d spi %d\n", - env->insn_idx, st->frameno, i); - env->internal_error = true; - } __mark_reg_not_init(env, &st->stack[i].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) st->stack[i].slot_type[j] = STACK_INVALID; @@ -18609,25 +18475,23 @@ static void clean_verifier_state(struct bpf_verifier_env *env, * but a lot of states will get revised from liveness point of view when * the verifier explores other branches. * Example: - * 1: r0 = 1 + * 1: *(u64)(r10 - 8) = 1 * 2: if r1 == 100 goto pc+1 - * 3: r0 = 2 - * 4: exit - * when the verifier reaches exit insn the register r0 in the state list of - * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch - * of insn 2 and goes exploring further. At the insn 4 it will walk the - * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ. + * 3: *(u64)(r10 - 8) = 2 + * 4: r0 = *(u64)(r10 - 8) + * 5: exit + * when the verifier reaches exit insn the stack slot -8 in the state list of + * insn 2 is not yet marked alive. Then the verifier pops the other_branch + * of insn 2 and goes exploring further. After the insn 4 read, liveness + * analysis would propagate read mark for -8 at insn 2. * * Since the verifier pushes the branch states as it sees them while exploring * the program the condition of walking the branch instruction for the second * time means that all states below this branch were already explored and * their final liveness marks are already propagated. * Hence when the verifier completes the search of state list in is_state_visited() - * we can call this clean_live_states() function to mark all liveness states - * as st->cleaned to indicate that 'parent' pointers of 'struct bpf_reg_state' - * will not be used. 
- * This function also clears the registers and stack for states that !READ - * to simplify state merging. + * we can call this clean_live_states() function to clear dead the registers and stack + * slots to simplify state merging. * * Important note here that walking the same branch instruction in the callee * doesn't meant that the states are DONE. The verifier has to compare @@ -18802,7 +18666,6 @@ static struct bpf_reg_state unbound_reg; static __init int unbound_reg_init(void) { __mark_reg_unknown_imprecise(&unbound_reg); - unbound_reg.live |= REG_LIVE_READ; return 0; } late_initcall(unbound_reg_init); @@ -19097,91 +18960,6 @@ static bool states_equal(struct bpf_verifier_env *env, return true; } -/* Return 0 if no propagation happened. Return negative error code if error - * happened. Otherwise, return the propagated bit. - */ -static int propagate_liveness_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - struct bpf_reg_state *parent_reg) -{ - u8 parent_flag = parent_reg->live & REG_LIVE_READ; - u8 flag = reg->live & REG_LIVE_READ; - int err; - - /* When comes here, read flags of PARENT_REG or REG could be any of - * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need - * of propagation if PARENT_REG has strongest REG_LIVE_READ64. - */ - if (parent_flag == REG_LIVE_READ64 || - /* Or if there is no read flag from REG. */ - !flag || - /* Or if the read flag from REG is the same as PARENT_REG. */ - parent_flag == flag) - return 0; - - err = mark_reg_read(env, reg, parent_reg, flag); - if (err) - return err; - - return flag; -} - -/* A write screens off any subsequent reads; but write marks come from the - * straight-line code between a state and its parent. When we arrive at an - * equivalent state (jump target or such) we didn't arrive by the straight-line - * code, so read marks in the state must propagate to the parent regardless - * of the state's write marks. That's what 'parent == state->parent' comparison - * in mark_reg_read() is for. - */ -static int propagate_liveness(struct bpf_verifier_env *env, - const struct bpf_verifier_state *vstate, - struct bpf_verifier_state *vparent, - bool *changed) -{ - struct bpf_reg_state *state_reg, *parent_reg; - struct bpf_func_state *state, *parent; - int i, frame, err = 0; - bool tmp = false; - - changed = changed ?: &tmp; - if (vparent->curframe != vstate->curframe) { - WARN(1, "propagate_live: parent frame %d current frame %d\n", - vparent->curframe, vstate->curframe); - return -EFAULT; - } - /* Propagate read liveness of registers... */ - BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); - for (frame = 0; frame <= vstate->curframe; frame++) { - parent = vparent->frame[frame]; - state = vstate->frame[frame]; - parent_reg = parent->regs; - state_reg = state->regs; - /* We don't need to worry about FP liveness, it's read-only */ - for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { - err = propagate_liveness_reg(env, &state_reg[i], - &parent_reg[i]); - if (err < 0) - return err; - *changed |= err > 0; - if (err == REG_LIVE_READ64) - mark_insn_zext(env, &parent_reg[i]); - } - - /* Propagate stack slots. 
*/ - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && - i < parent->allocated_stack / BPF_REG_SIZE; i++) { - parent_reg = &parent->stack[i].spilled_ptr; - state_reg = &state->stack[i].spilled_ptr; - err = propagate_liveness_reg(env, state_reg, - parent_reg); - *changed |= err > 0; - if (err < 0) - return err; - } - } - return 0; -} - /* find precise scalars in the previous equivalent state and * propagate them into the current state */ @@ -19201,8 +18979,7 @@ static int propagate_precision(struct bpf_verifier_env *env, first = true; for (i = 0; i < BPF_REG_FP; i++, state_reg++) { if (state_reg->type != SCALAR_VALUE || - !state_reg->precise || - !(state_reg->live & REG_LIVE_READ)) + !state_reg->precise) continue; if (env->log.level & BPF_LOG_LEVEL2) { if (first) @@ -19219,8 +18996,7 @@ static int propagate_precision(struct bpf_verifier_env *env, continue; state_reg = &state->stack[i].spilled_ptr; if (state_reg->type != SCALAR_VALUE || - !state_reg->precise || - !(state_reg->live & REG_LIVE_READ)) + !state_reg->precise) continue; if (env->log.level & BPF_LOG_LEVEL2) { if (first) @@ -19270,9 +19046,6 @@ static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visi changed = false; for (backedge = visit->backedges; backedge; backedge = backedge->next) { st = &backedge->state; - err = propagate_liveness(env, st->equal_state, st, &changed); - if (err) - return err; err = propagate_precision(env, st->equal_state, st, &changed); if (err) return err; @@ -19296,7 +19069,7 @@ static bool states_maybe_looping(struct bpf_verifier_state *old, fcur = cur->frame[fr]; for (i = 0; i < MAX_BPF_REG; i++) if (memcmp(&fold->regs[i], &fcur->regs[i], - offsetof(struct bpf_reg_state, parent))) + offsetof(struct bpf_reg_state, frameno))) return false; return true; } @@ -19394,7 +19167,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state, *new; bool force_new_state, add_new_state, loop; - int i, j, n, err, states_cnt = 0; + int n, err, states_cnt = 0; struct list_head *pos, *tmp, *head; force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || @@ -19551,28 +19324,16 @@ skip_inf_loop_check: loop = incomplete_read_marks(env, &sl->state); if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) { hit: - if (env->internal_error) - return -EFAULT; sl->hit_cnt++; - /* reached equivalent register/stack state, - * prune the search. - * Registers read by the continuation are read by us. - * If we have any write marks in env->cur_state, they - * will prevent corresponding reads in the continuation - * from reaching our parent (an explored_state). Our - * own state will get the read marks recorded, but - * they'll be immediately forgotten as we're pruning - * this state and will pop a new one. - */ - err = propagate_liveness(env, &sl->state, cur, NULL); /* if previous state reached the exit with precision and * current state is equivalent to it (except precision marks) * the precision needs to be propagated back in * the current state. */ + err = 0; if (is_jmp_point(env, env->insn_idx)) - err = err ? : push_jmp_history(env, cur, 0, 0); + err = push_jmp_history(env, cur, 0, 0); err = err ? : propagate_precision(env, &sl->state, cur, NULL); if (err) return err; @@ -19667,8 +19428,6 @@ hit: return 1; } miss: - if (env->internal_error) - return -EFAULT; /* when new state is not going to be added do not increase miss count. 
* Otherwise several loop iterations will remove the state * recorded earlier. The goal of these heuristics is to have @@ -19754,38 +19513,6 @@ miss: cur->dfs_depth = new->dfs_depth + 1; clear_jmp_history(cur); list_add(&new_sl->node, head); - - /* connect new state to parentage chain. Current frame needs all - * registers connected. Only r6 - r9 of the callers are alive (pushed - * to the stack implicitly by JITs) so in callers' frames connect just - * r6 - r9 as an optimization. Callers will have r1 - r5 connected to - * the state of the call instruction (with WRITTEN set), and r0 comes - * from callee with its full parentage chain, anyway. - */ - /* clear write marks in current state: the writes we did are not writes - * our child did, so they don't screen off its reads from us. - * (There are no read marks in current state, because reads always mark - * their parent and current state never has children yet. Only - * explored_states can get read marks.) - */ - for (j = 0; j <= cur->curframe; j++) { - for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) - cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; - for (i = 0; i < BPF_REG_FP; i++) - cur->frame[j]->regs[i].live = REG_LIVE_NONE; - } - - /* all stack frames are accessible from callee, clear them all */ - for (j = 0; j <= cur->curframe; j++) { - struct bpf_func_state *frame = cur->frame[j]; - struct bpf_func_state *newframe = new->frame[j]; - - for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) { - frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; - frame->stack[i].spilled_ptr.parent = - &newframe->stack[i].spilled_ptr; - } - } return 0; } diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c index 1d53a8561ee2..24c509ce4e5b 100644 --- a/tools/testing/selftests/bpf/prog_tests/align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -42,11 +42,11 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "2"}, - {1, "R3_w", "4"}, - {2, "R3_w", "8"}, - {3, "R3_w", "16"}, - {4, "R3_w", "32"}, + {0, "R3", "2"}, + {1, "R3", "4"}, + {2, "R3", "8"}, + {3, "R3", "16"}, + {4, "R3", "32"}, }, }, { @@ -70,17 +70,17 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "1"}, - {1, "R3_w", "2"}, - {2, "R3_w", "4"}, - {3, "R3_w", "8"}, - {4, "R3_w", "16"}, - {5, "R3_w", "1"}, - {6, "R4_w", "32"}, - {7, "R4_w", "16"}, - {8, "R4_w", "8"}, - {9, "R4_w", "4"}, - {10, "R4_w", "2"}, + {0, "R3", "1"}, + {1, "R3", "2"}, + {2, "R3", "4"}, + {3, "R3", "8"}, + {4, "R3", "16"}, + {5, "R3", "1"}, + {6, "R4", "32"}, + {7, "R4", "16"}, + {8, "R4", "8"}, + {9, "R4", "4"}, + {10, "R4", "2"}, }, }, { @@ -99,12 +99,12 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "4"}, - {1, "R3_w", "8"}, - {2, "R3_w", "10"}, - {3, "R4_w", "8"}, - {4, "R4_w", "12"}, - {5, "R4_w", "14"}, + {0, "R3", "4"}, + {1, "R3", "8"}, + {2, "R3", "10"}, + {3, "R4", "8"}, + {4, "R4", "12"}, + {5, "R4", "14"}, }, }, { @@ -121,10 +121,10 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1", "ctx()"}, {0, "R10", "fp0"}, - {0, "R3_w", "7"}, - {1, "R3_w", "7"}, - {2, "R3_w", "14"}, - {3, "R3_w", "56"}, + {0, "R3", "7"}, + {1, "R3", "7"}, + {2, "R3", "14"}, + {3, "R3", "56"}, }, }, @@ -162,19 +162,19 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {6, "R0_w", "pkt(off=8,r=8)"}, - {6, 
"R3_w", "var_off=(0x0; 0xff)"}, - {7, "R3_w", "var_off=(0x0; 0x1fe)"}, - {8, "R3_w", "var_off=(0x0; 0x3fc)"}, - {9, "R3_w", "var_off=(0x0; 0x7f8)"}, - {10, "R3_w", "var_off=(0x0; 0xff0)"}, - {12, "R3_w", "pkt_end()"}, - {17, "R4_w", "var_off=(0x0; 0xff)"}, - {18, "R4_w", "var_off=(0x0; 0x1fe0)"}, - {19, "R4_w", "var_off=(0x0; 0xff0)"}, - {20, "R4_w", "var_off=(0x0; 0x7f8)"}, - {21, "R4_w", "var_off=(0x0; 0x3fc)"}, - {22, "R4_w", "var_off=(0x0; 0x1fe)"}, + {6, "R0", "pkt(off=8,r=8)"}, + {6, "R3", "var_off=(0x0; 0xff)"}, + {7, "R3", "var_off=(0x0; 0x1fe)"}, + {8, "R3", "var_off=(0x0; 0x3fc)"}, + {9, "R3", "var_off=(0x0; 0x7f8)"}, + {10, "R3", "var_off=(0x0; 0xff0)"}, + {12, "R3", "pkt_end()"}, + {17, "R4", "var_off=(0x0; 0xff)"}, + {18, "R4", "var_off=(0x0; 0x1fe0)"}, + {19, "R4", "var_off=(0x0; 0xff0)"}, + {20, "R4", "var_off=(0x0; 0x7f8)"}, + {21, "R4", "var_off=(0x0; 0x3fc)"}, + {22, "R4", "var_off=(0x0; 0x1fe)"}, }, }, { @@ -195,16 +195,16 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {6, "R3_w", "var_off=(0x0; 0xff)"}, - {7, "R4_w", "var_off=(0x0; 0xff)"}, - {8, "R4_w", "var_off=(0x0; 0xff)"}, - {9, "R4_w", "var_off=(0x0; 0xff)"}, - {10, "R4_w", "var_off=(0x0; 0x1fe)"}, - {11, "R4_w", "var_off=(0x0; 0xff)"}, - {12, "R4_w", "var_off=(0x0; 0x3fc)"}, - {13, "R4_w", "var_off=(0x0; 0xff)"}, - {14, "R4_w", "var_off=(0x0; 0x7f8)"}, - {15, "R4_w", "var_off=(0x0; 0xff0)"}, + {6, "R3", "var_off=(0x0; 0xff)"}, + {7, "R4", "var_off=(0x0; 0xff)"}, + {8, "R4", "var_off=(0x0; 0xff)"}, + {9, "R4", "var_off=(0x0; 0xff)"}, + {10, "R4", "var_off=(0x0; 0x1fe)"}, + {11, "R4", "var_off=(0x0; 0xff)"}, + {12, "R4", "var_off=(0x0; 0x3fc)"}, + {13, "R4", "var_off=(0x0; 0xff)"}, + {14, "R4", "var_off=(0x0; 0x7f8)"}, + {15, "R4", "var_off=(0x0; 0xff0)"}, }, }, { @@ -235,14 +235,14 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {2, "R5_w", "pkt(r=0)"}, - {4, "R5_w", "pkt(off=14,r=0)"}, - {5, "R4_w", "pkt(off=14,r=0)"}, + {2, "R5", "pkt(r=0)"}, + {4, "R5", "pkt(off=14,r=0)"}, + {5, "R4", "pkt(off=14,r=0)"}, {9, "R2", "pkt(r=18)"}, {10, "R5", "pkt(off=14,r=18)"}, - {10, "R4_w", "var_off=(0x0; 0xff)"}, - {13, "R4_w", "var_off=(0x0; 0xffff)"}, - {14, "R4_w", "var_off=(0x0; 0xffff)"}, + {10, "R4", "var_off=(0x0; 0xff)"}, + {13, "R4", "var_off=(0x0; 0xffff)"}, + {14, "R4", "var_off=(0x0; 0xffff)"}, }, }, { @@ -299,12 +299,12 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(r=8)"}, - {7, "R6_w", "var_off=(0x0; 0x3fc)"}, + {6, "R2", "pkt(r=8)"}, + {7, "R6", "var_off=(0x0; 0x3fc)"}, /* Offset is added to packet pointer R5, resulting in * known fixed offset, and variable offset from R6. */ - {11, "R5_w", "pkt(id=1,off=14,"}, + {11, "R5", "pkt(id=1,off=14,"}, /* At the time the word size load is performed from R5, * it's total offset is NET_IP_ALIGN + reg->off (0) + * reg->aux_off (14) which is 16. Then the variable @@ -320,12 +320,12 @@ static struct bpf_align_test tests[] = { * instruction to validate R5 state. We also check * that R4 is what it should be in such case. */ - {18, "R4_w", "var_off=(0x0; 0x3fc)"}, - {18, "R5_w", "var_off=(0x0; 0x3fc)"}, + {18, "R4", "var_off=(0x0; 0x3fc)"}, + {18, "R5", "var_off=(0x0; 0x3fc)"}, /* Constant offset is added to R5, resulting in * reg->off of 14. 
*/ - {19, "R5_w", "pkt(id=2,off=14,"}, + {19, "R5", "pkt(id=2,off=14,"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off * (14) which is 16. Then the variable offset is 4-byte @@ -337,21 +337,21 @@ static struct bpf_align_test tests[] = { /* Constant offset is added to R5 packet pointer, * resulting in reg->off value of 14. */ - {26, "R5_w", "pkt(off=14,r=8)"}, + {26, "R5", "pkt(off=14,r=8)"}, /* Variable offset is added to R5, resulting in a * variable offset of (4n). See comment for insn #18 * for R4 = R5 trick. */ - {28, "R4_w", "var_off=(0x0; 0x3fc)"}, - {28, "R5_w", "var_off=(0x0; 0x3fc)"}, + {28, "R4", "var_off=(0x0; 0x3fc)"}, + {28, "R5", "var_off=(0x0; 0x3fc)"}, /* Constant is added to R5 again, setting reg->off to 18. */ - {29, "R5_w", "pkt(id=3,off=18,"}, + {29, "R5", "pkt(id=3,off=18,"}, /* And once more we add a variable; resulting var_off * is still (4n), fixed offset is not changed. * Also, we create a new reg->id. */ - {31, "R4_w", "var_off=(0x0; 0x7fc)"}, - {31, "R5_w", "var_off=(0x0; 0x7fc)"}, + {31, "R4", "var_off=(0x0; 0x7fc)"}, + {31, "R5", "var_off=(0x0; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (18) * which is 20. Then the variable offset is (4n), so @@ -397,12 +397,12 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(r=8)"}, - {7, "R6_w", "var_off=(0x0; 0x3fc)"}, + {6, "R2", "pkt(r=8)"}, + {7, "R6", "var_off=(0x0; 0x3fc)"}, /* Adding 14 makes R6 be (4n+2) */ - {8, "R6_w", "var_off=(0x2; 0x7fc)"}, + {8, "R6", "var_off=(0x2; 0x7fc)"}, /* Packet pointer has (4n+2) offset */ - {11, "R5_w", "var_off=(0x2; 0x7fc)"}, + {11, "R5", "var_off=(0x2; 0x7fc)"}, {12, "R4", "var_off=(0x2; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) @@ -414,11 +414,11 @@ static struct bpf_align_test tests[] = { /* Newly read value in R6 was shifted left by 2, so has * known alignment of 4. */ - {17, "R6_w", "var_off=(0x0; 0x3fc)"}, + {17, "R6", "var_off=(0x0; 0x3fc)"}, /* Added (4n) to packet pointer's (4n+2) var_off, giving * another (4n+2). */ - {19, "R5_w", "var_off=(0x2; 0xffc)"}, + {19, "R5", "var_off=(0x2; 0xffc)"}, {20, "R4", "var_off=(0x2; 0xffc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) @@ -459,18 +459,18 @@ static struct bpf_align_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .matches = { - {3, "R5_w", "pkt_end()"}, + {3, "R5", "pkt_end()"}, /* (ptr - ptr) << 2 == unknown, (4n) */ - {5, "R5_w", "var_off=(0x0; 0xfffffffffffffffc)"}, + {5, "R5", "var_off=(0x0; 0xfffffffffffffffc)"}, /* (4n) + 14 == (4n+2). We blow our bounds, because * the add could overflow. */ - {6, "R5_w", "var_off=(0x2; 0xfffffffffffffffc)"}, + {6, "R5", "var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>=0 */ {9, "R5", "var_off=(0x2; 0x7ffffffffffffffc)"}, /* packet pointer + nonnegative (4n+2) */ - {11, "R6_w", "var_off=(0x2; 0x7ffffffffffffffc)"}, - {12, "R4_w", "var_off=(0x2; 0x7ffffffffffffffc)"}, + {11, "R6", "var_off=(0x2; 0x7ffffffffffffffc)"}, + {12, "R4", "var_off=(0x2; 0x7ffffffffffffffc)"}, /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. 
* We checked the bounds, but it might have been able * to overflow if the packet pointer started in the @@ -478,7 +478,7 @@ static struct bpf_align_test tests[] = { * So we did not get a 'range' on R6, and the access * attempt will fail. */ - {15, "R6_w", "var_off=(0x2; 0x7ffffffffffffffc)"}, + {15, "R6", "var_off=(0x2; 0x7ffffffffffffffc)"}, } }, { @@ -513,12 +513,12 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(r=8)"}, - {8, "R6_w", "var_off=(0x0; 0x3fc)"}, + {6, "R2", "pkt(r=8)"}, + {8, "R6", "var_off=(0x0; 0x3fc)"}, /* Adding 14 makes R6 be (4n+2) */ - {9, "R6_w", "var_off=(0x2; 0x7fc)"}, + {9, "R6", "var_off=(0x2; 0x7fc)"}, /* New unknown value in R7 is (4n) */ - {10, "R7_w", "var_off=(0x0; 0x3fc)"}, + {10, "R7", "var_off=(0x0; 0x3fc)"}, /* Subtracting it from R6 blows our unsigned bounds */ {11, "R6", "var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>= 0 */ @@ -566,16 +566,16 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {6, "R2_w", "pkt(r=8)"}, - {9, "R6_w", "var_off=(0x0; 0x3c)"}, + {6, "R2", "pkt(r=8)"}, + {9, "R6", "var_off=(0x0; 0x3c)"}, /* Adding 14 makes R6 be (4n+2) */ - {10, "R6_w", "var_off=(0x2; 0x7c)"}, + {10, "R6", "var_off=(0x2; 0x7c)"}, /* Subtracting from packet pointer overflows ubounds */ - {13, "R5_w", "var_off=(0xffffffffffffff82; 0x7c)"}, + {13, "R5", "var_off=(0xffffffffffffff82; 0x7c)"}, /* New unknown value in R7 is (4n), >= 76 */ - {14, "R7_w", "var_off=(0x0; 0x7fc)"}, + {14, "R7", "var_off=(0x0; 0x7fc)"}, /* Adding it to packet pointer gives nice bounds again */ - {16, "R5_w", "var_off=(0x2; 0x7fc)"}, + {16, "R5", "var_off=(0x2; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. 
Then the variable offset is (4n+2), so diff --git a/tools/testing/selftests/bpf/prog_tests/spin_lock.c b/tools/testing/selftests/bpf/prog_tests/spin_lock.c index e3ea5dc2f697..254fbfeab06a 100644 --- a/tools/testing/selftests/bpf/prog_tests/spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/spin_lock.c @@ -13,22 +13,22 @@ static struct { const char *err_msg; } spin_lock_fail_tests[] = { { "lock_id_kptr_preserve", - "5: (bf) r1 = r0 ; R0_w=ptr_foo(id=2,ref_obj_id=2) " - "R1_w=ptr_foo(id=2,ref_obj_id=2) refs=2\n6: (85) call bpf_this_cpu_ptr#154\n" + "5: (bf) r1 = r0 ; R0=ptr_foo(id=2,ref_obj_id=2) " + "R1=ptr_foo(id=2,ref_obj_id=2) refs=2\n6: (85) call bpf_this_cpu_ptr#154\n" "R1 type=ptr_ expected=percpu_ptr_" }, { "lock_id_global_zero", - "; R1_w=map_value(map=.data.A,ks=4,vs=4)\n2: (85) call bpf_this_cpu_ptr#154\n" + "; R1=map_value(map=.data.A,ks=4,vs=4)\n2: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_mapval_preserve", "[0-9]\\+: (bf) r1 = r0 ;" - " R0_w=map_value(id=1,map=array_map,ks=4,vs=8)" - " R1_w=map_value(id=1,map=array_map,ks=4,vs=8)\n" + " R0=map_value(id=1,map=array_map,ks=4,vs=8)" + " R1=map_value(id=1,map=array_map,ks=4,vs=8)\n" "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_innermapval_preserve", "[0-9]\\+: (bf) r1 = r0 ;" " R0=map_value(id=2,ks=4,vs=8)" - " R1_w=map_value(id=2,ks=4,vs=8)\n" + " R1=map_value(id=2,ks=4,vs=8)\n" "[0-9]\\+: (85) call bpf_this_cpu_ptr#154\n" "R1 type=map_value expected=percpu_ptr_" }, { "lock_id_mismatch_kptr_kptr", "bpf_spin_unlock of different lock" }, diff --git a/tools/testing/selftests/bpf/prog_tests/test_veristat.c b/tools/testing/selftests/bpf/prog_tests/test_veristat.c index 367f47e4a936..b38c16b4247f 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_veristat.c +++ b/tools/testing/selftests/bpf/prog_tests/test_veristat.c @@ -75,26 +75,26 @@ static void test_set_global_vars_succeeds(void) " -vl2 > %s", fix->veristat, fix->tmpfile); read(fix->fd, fix->output, fix->sz); - __CHECK_STR("_w=0xf000000000000001 ", "var_s64 = 0xf000000000000001"); - __CHECK_STR("_w=0xfedcba9876543210 ", "var_u64 = 0xfedcba9876543210"); - __CHECK_STR("_w=0x80000000 ", "var_s32 = -0x80000000"); - __CHECK_STR("_w=0x76543210 ", "var_u32 = 0x76543210"); - __CHECK_STR("_w=0x8000 ", "var_s16 = -32768"); - __CHECK_STR("_w=0xecec ", "var_u16 = 60652"); - __CHECK_STR("_w=128 ", "var_s8 = -128"); - __CHECK_STR("_w=255 ", "var_u8 = 255"); - __CHECK_STR("_w=11 ", "var_ea = EA2"); - __CHECK_STR("_w=12 ", "var_eb = EB2"); - __CHECK_STR("_w=13 ", "var_ec = EC2"); - __CHECK_STR("_w=1 ", "var_b = 1"); - __CHECK_STR("_w=170 ", "struct1[2].struct2[1][2].u.var_u8[2]=170"); - __CHECK_STR("_w=0xaaaa ", "union1.var_u16 = 0xaaaa"); - __CHECK_STR("_w=171 ", "arr[3]= 171"); - __CHECK_STR("_w=172 ", "arr[EA2] =172"); - __CHECK_STR("_w=10 ", "enum_arr[EC2]=EA3"); - __CHECK_STR("_w=173 ", "matrix[31][7][11]=173"); - __CHECK_STR("_w=174 ", "struct1[2].struct2[1][2].u.mat[5][3]=174"); - __CHECK_STR("_w=175 ", "struct11[7][5].struct2[0][1].u.mat[3][0]=175"); + __CHECK_STR("=0xf000000000000001 ", "var_s64 = 0xf000000000000001"); + __CHECK_STR("=0xfedcba9876543210 ", "var_u64 = 0xfedcba9876543210"); + __CHECK_STR("=0x80000000 ", "var_s32 = -0x80000000"); + __CHECK_STR("=0x76543210 ", "var_u32 = 0x76543210"); + __CHECK_STR("=0x8000 ", "var_s16 = -32768"); + __CHECK_STR("=0xecec ", "var_u16 = 60652"); + __CHECK_STR("=128 ", "var_s8 = -128"); + __CHECK_STR("=255 ", "var_u8 = 255"); + 
__CHECK_STR("=11 ", "var_ea = EA2"); + __CHECK_STR("=12 ", "var_eb = EB2"); + __CHECK_STR("=13 ", "var_ec = EC2"); + __CHECK_STR("=1 ", "var_b = 1"); + __CHECK_STR("=170 ", "struct1[2].struct2[1][2].u.var_u8[2]=170"); + __CHECK_STR("=0xaaaa ", "union1.var_u16 = 0xaaaa"); + __CHECK_STR("=171 ", "arr[3]= 171"); + __CHECK_STR("=172 ", "arr[EA2] =172"); + __CHECK_STR("=10 ", "enum_arr[EC2]=EA3"); + __CHECK_STR("=173 ", "matrix[31][7][11]=173"); + __CHECK_STR("=174 ", "struct1[2].struct2[1][2].u.mat[5][3]=174"); + __CHECK_STR("=175 ", "struct11[7][5].struct2[0][1].u.mat[3][0]=175"); out: teardown_fixture(fix); @@ -117,8 +117,8 @@ static void test_set_global_vars_from_file_succeeds(void) SYS(out, "%s set_global_vars.bpf.o -G \"@%s\" -vl2 > %s", fix->veristat, input_file, fix->tmpfile); read(fix->fd, fix->output, fix->sz); - __CHECK_STR("_w=0x8000 ", "var_s16 = -32768"); - __CHECK_STR("_w=0xecec ", "var_u16 = 60652"); + __CHECK_STR("=0x8000 ", "var_s16 = -32768"); + __CHECK_STR("=0xecec ", "var_u16 = 60652"); out: close(fd); diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c index 5e0a1ca96d4e..a01c2736890f 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_assert.c +++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c @@ -18,43 +18,43 @@ return *(u64 *)num; \ } -__msg(": R0_w=0xffffffff80000000") +__msg(": R0=0xffffffff80000000") check_assert(s64, ==, eq_int_min, INT_MIN); -__msg(": R0_w=0x7fffffff") +__msg(": R0=0x7fffffff") check_assert(s64, ==, eq_int_max, INT_MAX); -__msg(": R0_w=0") +__msg(": R0=0") check_assert(s64, ==, eq_zero, 0); -__msg(": R0_w=0x8000000000000000 R1_w=0x8000000000000000") +__msg(": R0=0x8000000000000000 R1=0x8000000000000000") check_assert(s64, ==, eq_llong_min, LLONG_MIN); -__msg(": R0_w=0x7fffffffffffffff R1_w=0x7fffffffffffffff") +__msg(": R0=0x7fffffffffffffff R1=0x7fffffffffffffff") check_assert(s64, ==, eq_llong_max, LLONG_MAX); -__msg(": R0_w=scalar(id=1,smax=0x7ffffffe)") +__msg(": R0=scalar(id=1,smax=0x7ffffffe)") check_assert(s64, <, lt_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smax=-1,umin=0x8000000000000000,var_off=(0x8000000000000000; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smax=-1,umin=0x8000000000000000,var_off=(0x8000000000000000; 0x7fffffffffffffff))") check_assert(s64, <, lt_zero, 0); -__msg(": R0_w=scalar(id=1,smax=0xffffffff7fffffff") +__msg(": R0=scalar(id=1,smax=0xffffffff7fffffff") check_assert(s64, <, lt_neg, INT_MIN); -__msg(": R0_w=scalar(id=1,smax=0x7fffffff)") +__msg(": R0=scalar(id=1,smax=0x7fffffff)") check_assert(s64, <=, le_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smax=0)") +__msg(": R0=scalar(id=1,smax=0)") check_assert(s64, <=, le_zero, 0); -__msg(": R0_w=scalar(id=1,smax=0xffffffff80000000") +__msg(": R0=scalar(id=1,smax=0xffffffff80000000") check_assert(s64, <=, le_neg, INT_MIN); -__msg(": R0_w=scalar(id=1,smin=umin=0x80000000,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=umin=0x80000000,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >, gt_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smin=umin=1,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=umin=1,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >, gt_zero, 0); -__msg(": R0_w=scalar(id=1,smin=0xffffffff80000001") +__msg(": R0=scalar(id=1,smin=0xffffffff80000001") check_assert(s64, >, gt_neg, INT_MIN); -__msg(": 
R0_w=scalar(id=1,smin=umin=0x7fffffff,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=umin=0x7fffffff,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >=, ge_pos, INT_MAX); -__msg(": R0_w=scalar(id=1,smin=0,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") +__msg(": R0=scalar(id=1,smin=0,umax=0x7fffffffffffffff,var_off=(0x0; 0x7fffffffffffffff))") check_assert(s64, >=, ge_zero, 0); -__msg(": R0_w=scalar(id=1,smin=0xffffffff80000000") +__msg(": R0=scalar(id=1,smin=0xffffffff80000000") check_assert(s64, >=, ge_neg, INT_MIN); SEC("?tc") diff --git a/tools/testing/selftests/bpf/progs/iters_state_safety.c b/tools/testing/selftests/bpf/progs/iters_state_safety.c index b381ac0c736c..d273b46dfc7c 100644 --- a/tools/testing/selftests/bpf/progs/iters_state_safety.c +++ b/tools/testing/selftests/bpf/progs/iters_state_safety.c @@ -30,7 +30,7 @@ int force_clang_to_emit_btf_for_externs(void *ctx) SEC("?raw_tp") __success __log_level(2) -__msg("fp-8_w=iter_num(ref_id=1,state=active,depth=0)") +__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)") int create_and_destroy(void *ctx) { struct bpf_iter_num iter; @@ -196,7 +196,7 @@ int leak_iter_from_subprog_fail(void *ctx) SEC("?raw_tp") __success __log_level(2) -__msg("fp-8_w=iter_num(ref_id=1,state=active,depth=0)") +__msg("fp-8=iter_num(ref_id=1,state=active,depth=0)") int valid_stack_reuse(void *ctx) { struct bpf_iter_num iter; diff --git a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c index 6543d5b6e0a9..83791348bed5 100644 --- a/tools/testing/selftests/bpf/progs/iters_testmod_seq.c +++ b/tools/testing/selftests/bpf/progs/iters_testmod_seq.c @@ -20,7 +20,7 @@ __s64 res_empty; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") __msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_empty(const void *ctx) @@ -38,7 +38,7 @@ __s64 res_full; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") __msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_full(const void *ctx) @@ -58,7 +58,7 @@ static volatile int zero = 0; SEC("raw_tp/sys_enter") __success __log_level(2) -__msg("fp-16_w=iter_testmod_seq(ref_id=1,state=active,depth=0)") +__msg("fp-16=iter_testmod_seq(ref_id=1,state=active,depth=0)") __msg("fp-16=iter_testmod_seq(ref_id=1,state=drained,depth=0)") __msg("call bpf_iter_testmod_seq_destroy") int testmod_seq_truncated(const void *ctx) diff --git a/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c b/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c index 4f94c971ae86..3b984b6ae7c0 100644 --- a/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c +++ b/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c @@ -8,8 +8,8 @@ SEC("tp_btf/sys_enter") __success __log_level(2) -__msg("r8 = *(u64 *)(r7 +0) ; R7_w=ptr_nameidata(off={{[0-9]+}}) R8_w=rdonly_untrusted_mem(sz=0)") -__msg("r9 = *(u8 *)(r8 +0) ; R8_w=rdonly_untrusted_mem(sz=0) R9_w=scalar") +__msg("r8 = *(u64 *)(r7 +0) ; R7=ptr_nameidata(off={{[0-9]+}}) R8=rdonly_untrusted_mem(sz=0)") +__msg("r9 = *(u8 *)(r8 +0) ; R8=rdonly_untrusted_mem(sz=0) R9=scalar") 
int btf_id_to_ptr_mem(void *ctx) { struct task_struct *task; diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index fbccc20555f4..0a72e0228ea9 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -926,7 +926,7 @@ l1_%=: r0 = 0; \ SEC("socket") __description("bounds check for non const xor src dst") __success __log_level(2) -__msg("5: (af) r0 ^= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__msg("5: (af) r0 ^= r6 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") __naked void non_const_xor_src_dst(void) { asm volatile (" \ @@ -947,7 +947,7 @@ __naked void non_const_xor_src_dst(void) SEC("socket") __description("bounds check for non const or src dst") __success __log_level(2) -__msg("5: (4f) r0 |= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") +__msg("5: (4f) r0 |= r6 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=431,var_off=(0x0; 0x1af))") __naked void non_const_or_src_dst(void) { asm volatile (" \ @@ -968,7 +968,7 @@ __naked void non_const_or_src_dst(void) SEC("socket") __description("bounds check for non const mul regs") __success __log_level(2) -__msg("5: (2f) r0 *= r6 ; R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=3825,var_off=(0x0; 0xfff))") +__msg("5: (2f) r0 *= r6 ; R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=3825,var_off=(0x0; 0xfff))") __naked void non_const_mul_regs(void) { asm volatile (" \ @@ -1241,7 +1241,7 @@ l0_%=: r0 = 0; \ SEC("tc") __description("multiply mixed sign bounds. test 1") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,umax32=0xfffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=0x1bc16d5cd4927ee1,smax=umax=0x1bc16d674ec80000,smax32=0x7ffffeff,umax32=0xfffffeff,var_off=(0x1bc16d4000000000; 0x3ffffffeff))") __naked void mult_mixed0_sign(void) { asm volatile ( @@ -1264,7 +1264,7 @@ __naked void mult_mixed0_sign(void) SEC("tc") __description("multiply mixed sign bounds. 
test 2") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=smin32=-100,smax=smax32=200)") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=smin32=-100,smax=smax32=200)") __naked void mult_mixed1_sign(void) { asm volatile ( @@ -1287,7 +1287,7 @@ __naked void mult_mixed1_sign(void) SEC("tc") __description("multiply negative bounds") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar(smin=umin=smin32=umin32=0x3ff280b0,smax=umax=smax32=umax32=0x3fff0001,var_off=(0x3ff00000; 0xf81ff))") +__msg("r6 *= r7 {{.*}}; R6=scalar(smin=umin=smin32=umin32=0x3ff280b0,smax=umax=smax32=umax32=0x3fff0001,var_off=(0x3ff00000; 0xf81ff))") __naked void mult_sign_bounds(void) { asm volatile ( @@ -1311,7 +1311,7 @@ __naked void mult_sign_bounds(void) SEC("tc") __description("multiply bounds that don't cross signed boundary") __success __log_level(2) -__msg("r8 *= r6 {{.*}}; R6_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=11,var_off=(0x0; 0xb)) R8_w=scalar(smin=0,smax=umax=0x7b96bb0a94a3a7cd,var_off=(0x0; 0x7fffffffffffffff))") +__msg("r8 *= r6 {{.*}}; R6=scalar(smin=smin32=0,smax=umax=smax32=umax32=11,var_off=(0x0; 0xb)) R8=scalar(smin=0,smax=umax=0x7b96bb0a94a3a7cd,var_off=(0x0; 0x7fffffffffffffff))") __naked void mult_no_sign_crossing(void) { asm volatile ( @@ -1331,7 +1331,7 @@ __naked void mult_no_sign_crossing(void) SEC("tc") __description("multiplication overflow, result in unbounded reg. test 1") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar()") +__msg("r6 *= r7 {{.*}}; R6=scalar()") __naked void mult_unsign_ovf(void) { asm volatile ( @@ -1353,7 +1353,7 @@ __naked void mult_unsign_ovf(void) SEC("tc") __description("multiplication overflow, result in unbounded reg. test 2") __success __log_level(2) -__msg("r6 *= r7 {{.*}}; R6_w=scalar()") +__msg("r6 *= r7 {{.*}}; R6=scalar()") __naked void mult_sign_ovf(void) { asm volatile ( @@ -1376,7 +1376,7 @@ __naked void mult_sign_ovf(void) SEC("socket") __description("64-bit addition, all outcomes overflow") __success __log_level(2) -__msg("5: (0f) r3 += r3 {{.*}} R3_w=scalar(umin=0x4000000000000000,umax=0xfffffffffffffffe)") +__msg("5: (0f) r3 += r3 {{.*}} R3=scalar(umin=0x4000000000000000,umax=0xfffffffffffffffe)") __retval(0) __naked void add64_full_overflow(void) { @@ -1396,7 +1396,7 @@ __naked void add64_full_overflow(void) SEC("socket") __description("64-bit addition, partial overflow, result in unbounded reg") __success __log_level(2) -__msg("4: (0f) r3 += r3 {{.*}} R3_w=scalar()") +__msg("4: (0f) r3 += r3 {{.*}} R3=scalar()") __retval(0) __naked void add64_partial_overflow(void) { @@ -1416,7 +1416,7 @@ __naked void add64_partial_overflow(void) SEC("socket") __description("32-bit addition overflow, all outcomes overflow") __success __log_level(2) -__msg("4: (0c) w3 += w3 {{.*}} R3_w=scalar(smin=umin=umin32=0x40000000,smax=umax=umax32=0xfffffffe,var_off=(0x0; 0xffffffff))") +__msg("4: (0c) w3 += w3 {{.*}} R3=scalar(smin=umin=umin32=0x40000000,smax=umax=umax32=0xfffffffe,var_off=(0x0; 0xffffffff))") __retval(0) __naked void add32_full_overflow(void) { @@ -1436,7 +1436,7 @@ __naked void add32_full_overflow(void) SEC("socket") __description("32-bit addition, partial overflow, result in unbounded u32 bounds") __success __log_level(2) -__msg("4: (0c) w3 += w3 {{.*}} R3_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") +__msg("4: (0c) w3 += w3 {{.*}} R3=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") __retval(0) __naked void add32_partial_overflow(void) { @@ -1456,7 +1456,7 @@ __naked void 
add32_partial_overflow(void) SEC("socket") __description("64-bit subtraction, all outcomes underflow") __success __log_level(2) -__msg("6: (1f) r3 -= r1 {{.*}} R3_w=scalar(umin=1,umax=0x8000000000000000)") +__msg("6: (1f) r3 -= r1 {{.*}} R3=scalar(umin=1,umax=0x8000000000000000)") __retval(0) __naked void sub64_full_overflow(void) { @@ -1477,7 +1477,7 @@ __naked void sub64_full_overflow(void) SEC("socket") __description("64-bit subtraction, partial overflow, result in unbounded reg") __success __log_level(2) -__msg("3: (1f) r3 -= r2 {{.*}} R3_w=scalar()") +__msg("3: (1f) r3 -= r2 {{.*}} R3=scalar()") __retval(0) __naked void sub64_partial_overflow(void) { @@ -1496,7 +1496,7 @@ __naked void sub64_partial_overflow(void) SEC("socket") __description("32-bit subtraction overflow, all outcomes underflow") __success __log_level(2) -__msg("5: (1c) w3 -= w1 {{.*}} R3_w=scalar(smin=umin=umin32=1,smax=umax=umax32=0x80000000,var_off=(0x0; 0xffffffff))") +__msg("5: (1c) w3 -= w1 {{.*}} R3=scalar(smin=umin=umin32=1,smax=umax=umax32=0x80000000,var_off=(0x0; 0xffffffff))") __retval(0) __naked void sub32_full_overflow(void) { @@ -1517,7 +1517,7 @@ __naked void sub32_full_overflow(void) SEC("socket") __description("32-bit subtraction, partial overflow, result in unbounded u32 bounds") __success __log_level(2) -__msg("3: (1c) w3 -= w2 {{.*}} R3_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") +__msg("3: (1c) w3 -= w2 {{.*}} R3=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))") __retval(0) __naked void sub32_partial_overflow(void) { @@ -1617,7 +1617,7 @@ l0_%=: r0 = 0; \ SEC("socket") __description("bounds deduction cross sign boundary, positive overlap") __success __log_level(2) __flag(BPF_F_TEST_REG_INVARIANTS) -__msg("3: (2d) if r0 > r1 {{.*}} R0_w=scalar(smin=smin32=0,smax=umax=smax32=umax32=127,var_off=(0x0; 0x7f))") +__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=0,smax=umax=smax32=umax32=127,var_off=(0x0; 0x7f))") __retval(0) __naked void bounds_deduct_positive_overlap(void) { @@ -1650,7 +1650,7 @@ l0_%=: r0 = 0; \ SEC("socket") __description("bounds deduction cross sign boundary, two overlaps") __failure __flag(BPF_F_TEST_REG_INVARIANTS) -__msg("3: (2d) if r0 > r1 {{.*}} R0_w=scalar(smin=smin32=-128,smax=smax32=127,umax=0xffffffffffffff80)") +__msg("3: (2d) if r0 > r1 {{.*}} R0=scalar(smin=smin32=-128,smax=smax32=127,umax=0xffffffffffffff80)") __msg("frame pointer is read only") __naked void bounds_deduct_two_overlaps(void) { diff --git a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c index 181da86ba5f0..6630a92b1b47 100644 --- a/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c +++ b/tools/testing/selftests/bpf/progs/verifier_global_ptr_args.c @@ -215,7 +215,7 @@ __weak int subprog_untrusted(const volatile struct task_struct *restrict task __ SEC("tp_btf/sys_enter") __success __log_level(2) -__msg("r1 = {{.*}}; {{.*}}R1_w=trusted_ptr_task_struct()") +__msg("r1 = {{.*}}; {{.*}}R1=trusted_ptr_task_struct()") __msg("Func#1 ('subprog_untrusted') is global and assumed valid.") __msg("Validating subprog_untrusted() func#1...") __msg(": R1=untrusted_ptr_task_struct") @@ -278,7 +278,7 @@ __weak int subprog_enum_untrusted(enum bpf_attach_type *p __arg_untrusted) SEC("tp_btf/sys_enter") __success __log_level(2) -__msg("r1 = {{.*}}; {{.*}}R1_w=trusted_ptr_task_struct()") +__msg("r1 = {{.*}}; {{.*}}R1=trusted_ptr_task_struct()") __msg("Func#1 ('subprog_void_untrusted') is global and 
assumed valid.") __msg("Validating subprog_void_untrusted() func#1...") __msg(": R1=rdonly_untrusted_mem(sz=0)") diff --git a/tools/testing/selftests/bpf/progs/verifier_ldsx.c b/tools/testing/selftests/bpf/progs/verifier_ldsx.c index 52edee41caf6..f087ffb79f20 100644 --- a/tools/testing/selftests/bpf/progs/verifier_ldsx.c +++ b/tools/testing/selftests/bpf/progs/verifier_ldsx.c @@ -65,7 +65,7 @@ __naked void ldsx_s32(void) SEC("socket") __description("LDSX, S8 range checking, privileged") __log_level(2) __success __retval(1) -__msg("R1_w=scalar(smin=smin32=-128,smax=smax32=127)") +__msg("R1=scalar(smin=smin32=-128,smax=smax32=127)") __naked void ldsx_s8_range_priv(void) { asm volatile ( diff --git a/tools/testing/selftests/bpf/progs/verifier_precision.c b/tools/testing/selftests/bpf/progs/verifier_precision.c index 73fee2aec698..1fe090cd6744 100644 --- a/tools/testing/selftests/bpf/progs/verifier_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_precision.c @@ -144,21 +144,21 @@ SEC("?raw_tp") __success __log_level(2) /* * Without the bug fix there will be no history between "last_idx 3 first_idx 3" - * and "parent state regs=" lines. "R0_w=6" parts are here to help anchor + * and "parent state regs=" lines. "R0=6" parts are here to help anchor * expected log messages to the one specific mark_chain_precision operation. * * This is quite fragile: if verifier checkpointing heuristic changes, this * might need adjusting. */ -__msg("2: (07) r0 += 1 ; R0_w=6") +__msg("2: (07) r0 += 1 ; R0=6") __msg("3: (35) if r0 >= 0xa goto pc+1") __msg("mark_precise: frame0: last_idx 3 first_idx 3 subseq_idx -1") __msg("mark_precise: frame0: regs=r0 stack= before 2: (07) r0 += 1") __msg("mark_precise: frame0: regs=r0 stack= before 1: (07) r0 += 1") __msg("mark_precise: frame0: regs=r0 stack= before 4: (05) goto pc-4") __msg("mark_precise: frame0: regs=r0 stack= before 3: (35) if r0 >= 0xa goto pc+1") -__msg("mark_precise: frame0: parent state regs= stack=: R0_rw=P4") -__msg("3: R0_w=6") +__msg("mark_precise: frame0: parent state regs= stack=: R0=P4") +__msg("3: R0=6") __naked int state_loop_first_last_equal(void) { asm volatile ( @@ -233,8 +233,8 @@ __naked void bpf_cond_op_not_r10(void) SEC("lsm.s/socket_connect") __success __log_level(2) -__msg("0: (b7) r0 = 1 ; R0_w=1") -__msg("1: (84) w0 = -w0 ; R0_w=0xffffffff") +__msg("0: (b7) r0 = 1 ; R0=1") +__msg("1: (84) w0 = -w0 ; R0=0xffffffff") __msg("mark_precise: frame0: last_idx 2 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r0 stack= before 1: (84) w0 = -w0") __msg("mark_precise: frame0: regs=r0 stack= before 0: (b7) r0 = 1") @@ -268,8 +268,8 @@ __naked int bpf_neg_3(void) SEC("lsm.s/socket_connect") __success __log_level(2) -__msg("0: (b7) r0 = 1 ; R0_w=1") -__msg("1: (87) r0 = -r0 ; R0_w=-1") +__msg("0: (b7) r0 = 1 ; R0=1") +__msg("1: (87) r0 = -r0 ; R0=-1") __msg("mark_precise: frame0: last_idx 2 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r0 stack= before 1: (87) r0 = -r0") __msg("mark_precise: frame0: regs=r0 stack= before 0: (b7) r0 = 1") diff --git a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c index dba3ca728f6e..c0ce690ddb68 100644 --- a/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c +++ b/tools/testing/selftests/bpf/progs/verifier_scalar_ids.c @@ -353,7 +353,7 @@ __flag(BPF_F_TEST_STATE_FREQ) * collect_linked_regs() can't tie more than 6 registers for a single insn. 
*/ __msg("8: (25) if r0 > 0x7 goto pc+0 ; R0=scalar(id=1") -__msg("9: (bf) r6 = r6 ; R6_w=scalar(id=2") +__msg("9: (bf) r6 = r6 ; R6=scalar(id=2") /* check that r{0-5} are marked precise after 'if' */ __msg("frame0: regs=r0 stack= before 8: (25) if r0 > 0x7 goto pc+0") __msg("frame0: parent state regs=r0,r1,r2,r3,r4,r5 stack=:") @@ -779,12 +779,12 @@ __success __retval(0) /* Check that verifier believes r1/r0 are zero at exit */ __log_level(2) -__msg("4: (77) r1 >>= 32 ; R1_w=0") -__msg("5: (bf) r0 = r1 ; R0_w=0 R1_w=0") +__msg("4: (77) r1 >>= 32 ; R1=0") +__msg("5: (bf) r0 = r1 ; R0=0 R1=0") __msg("6: (95) exit") __msg("from 3 to 4") -__msg("4: (77) r1 >>= 32 ; R1_w=0") -__msg("5: (bf) r0 = r1 ; R0_w=0 R1_w=0") +__msg("4: (77) r1 >>= 32 ; R1=0") +__msg("5: (bf) r0 = r1 ; R0=0 R1=0") __msg("6: (95) exit") /* Verify that statements to randomize upper half of r1 had not been * generated. diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 1e5a511e8494..7a13dbd794b2 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -506,17 +506,17 @@ SEC("raw_tp") __log_level(2) __success /* fp-8 is spilled IMPRECISE value zero (represented by a zero value fake reg) */ -__msg("2: (7a) *(u64 *)(r10 -8) = 0 ; R10=fp0 fp-8_w=0") +__msg("2: (7a) *(u64 *)(r10 -8) = 0 ; R10=fp0 fp-8=0") /* but fp-16 is spilled IMPRECISE zero const reg */ -__msg("4: (7b) *(u64 *)(r10 -16) = r0 ; R0_w=0 R10=fp0 fp-16_w=0") +__msg("4: (7b) *(u64 *)(r10 -16) = r0 ; R0=0 R10=fp0 fp-16=0") /* validate that assigning R2 from STACK_SPILL with zero value doesn't mark register * precise immediately; if necessary, it will be marked precise later */ -__msg("6: (71) r2 = *(u8 *)(r10 -1) ; R2_w=0 R10=fp0 fp-8_w=0") +__msg("6: (71) r2 = *(u8 *)(r10 -1) ; R2=0 R10=fp0 fp-8=0") /* similarly, when R2 is assigned from spilled register, it is initially * imprecise, but will be marked precise later once it is used in precise context */ -__msg("10: (71) r2 = *(u8 *)(r10 -9) ; R2_w=0 R10=fp0 fp-16_w=0") +__msg("10: (71) r2 = *(u8 *)(r10 -9) ; R2=0 R10=fp0 fp-16=0") __msg("11: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 11 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 10: (71) r2 = *(u8 *)(r10 -9)") @@ -598,7 +598,7 @@ __log_level(2) __success /* fp-4 is STACK_ZERO */ __msg("2: (62) *(u32 *)(r10 -4) = 0 ; R10=fp0 fp-8=0000????") -__msg("4: (71) r2 = *(u8 *)(r10 -1) ; R2_w=0 R10=fp0 fp-8=0000????") +__msg("4: (71) r2 = *(u8 *)(r10 -1) ; R2=0 R10=fp0 fp-8=0000????") __msg("5: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 5 first_idx 0 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 4: (71) r2 = *(u8 *)(r10 -1)") @@ -640,25 +640,25 @@ SEC("raw_tp") __log_level(2) __flag(BPF_F_TEST_STATE_FREQ) __success /* make sure fp-8 is IMPRECISE fake register spill */ -__msg("3: (7a) *(u64 *)(r10 -8) = 1 ; R10=fp0 fp-8_w=1") +__msg("3: (7a) *(u64 *)(r10 -8) = 1 ; R10=fp0 fp-8=1") /* and fp-16 is spilled IMPRECISE const reg */ -__msg("5: (7b) *(u64 *)(r10 -16) = r0 ; R0_w=1 R10=fp0 fp-16_w=1") +__msg("5: (7b) *(u64 *)(r10 -16) = r0 ; R0=1 R10=fp0 fp-16=1") /* validate load from fp-8, which was initialized using BPF_ST_MEM */ -__msg("8: (79) r2 = *(u64 *)(r10 -8) ; R2_w=1 R10=fp0 fp-8=1") +__msg("8: (79) r2 = *(u64 *)(r10 -8) ; R2=1 R10=fp0 fp-8=1") __msg("9: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 9 first_idx 7 subseq_idx -1") 
__msg("mark_precise: frame0: regs=r2 stack= before 8: (79) r2 = *(u64 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r1 = r6") /* note, fp-8 is precise, fp-16 is not yet precise, we'll get there */ -__msg("mark_precise: frame0: parent state regs= stack=-8: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_rw=P1 fp-16_w=1") +__msg("mark_precise: frame0: parent state regs= stack=-8: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=P1 fp-16=1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-8 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r0 = 1") __msg("mark_precise: frame0: regs= stack=-8 before 3: (7a) *(u64 *)(r10 -8) = 1") -__msg("10: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") /* validate load from fp-16, which was initialized using BPF_STX_MEM */ -__msg("12: (79) r2 = *(u64 *)(r10 -16) ; R2_w=1 R10=fp0 fp-16=1") +__msg("12: (79) r2 = *(u64 *)(r10 -16) ; R2=1 R10=fp0 fp-16=1") __msg("13: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 13 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 12: (79) r2 = *(u64 *)(r10 -16)") @@ -668,12 +668,12 @@ __msg("mark_precise: frame0: regs= stack=-16 before 9: (0f) r1 += r2") __msg("mark_precise: frame0: regs= stack=-16 before 8: (79) r2 = *(u64 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-16 before 7: (bf) r1 = r6") /* now both fp-8 and fp-16 are precise, very good */ -__msg("mark_precise: frame0: parent state regs= stack=-16: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_rw=P1 fp-16_rw=P1") +__msg("mark_precise: frame0: parent state regs= stack=-16: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=P1 fp-16=P1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-16 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-16 before 5: (7b) *(u64 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs=r0 stack= before 4: (b7) r0 = 1") -__msg("14: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") __naked void stack_load_preserves_const_precision(void) { asm volatile ( @@ -719,22 +719,22 @@ __success /* make sure fp-8 is 32-bit FAKE subregister spill */ __msg("3: (62) *(u32 *)(r10 -8) = 1 ; R10=fp0 fp-8=????1") /* but fp-16 is spilled IMPRECISE zero const reg */ -__msg("5: (63) *(u32 *)(r10 -16) = r0 ; R0_w=1 R10=fp0 fp-16=????1") +__msg("5: (63) *(u32 *)(r10 -16) = r0 ; R0=1 R10=fp0 fp-16=????1") /* validate load from fp-8, which was initialized using BPF_ST_MEM */ -__msg("8: (61) r2 = *(u32 *)(r10 -8) ; R2_w=1 R10=fp0 fp-8=????1") +__msg("8: (61) r2 = *(u32 *)(r10 -8) ; R2=1 R10=fp0 fp-8=????1") __msg("9: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 9 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 8: (61) r2 = *(u32 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-8 before 7: (bf) r1 = r6") -__msg("mark_precise: frame0: parent state regs= stack=-8: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_r=????P1 fp-16=????1") +__msg("mark_precise: frame0: parent state regs= stack=-8: R0=1 R1=ctx() 
R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=????P1 fp-16=????1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-8 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-8 before 5: (63) *(u32 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r0 = 1") __msg("mark_precise: frame0: regs= stack=-8 before 3: (62) *(u32 *)(r10 -8) = 1") -__msg("10: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") /* validate load from fp-16, which was initialized using BPF_STX_MEM */ -__msg("12: (61) r2 = *(u32 *)(r10 -16) ; R2_w=1 R10=fp0 fp-16=????1") +__msg("12: (61) r2 = *(u32 *)(r10 -16) ; R2=1 R10=fp0 fp-16=????1") __msg("13: (0f) r1 += r2") __msg("mark_precise: frame0: last_idx 13 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r2 stack= before 12: (61) r2 = *(u32 *)(r10 -16)") @@ -743,12 +743,12 @@ __msg("mark_precise: frame0: regs= stack=-16 before 10: (73) *(u8 *)(r1 +0) = r2 __msg("mark_precise: frame0: regs= stack=-16 before 9: (0f) r1 += r2") __msg("mark_precise: frame0: regs= stack=-16 before 8: (61) r2 = *(u32 *)(r10 -8)") __msg("mark_precise: frame0: regs= stack=-16 before 7: (bf) r1 = r6") -__msg("mark_precise: frame0: parent state regs= stack=-16: R0_w=1 R1=ctx() R6_r=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8_r=????P1 fp-16_r=????P1") +__msg("mark_precise: frame0: parent state regs= stack=-16: R0=1 R1=ctx() R6=map_value(map=.data.two_byte_,ks=4,vs=2) R10=fp0 fp-8=????P1 fp-16=????P1") __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-16 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-16 before 5: (63) *(u32 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs=r0 stack= before 4: (b7) r0 = 1") -__msg("14: R1_w=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2_w=1") +__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") __naked void stack_load_preserves_const_precision_subreg(void) { asm volatile ( diff --git a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c index 9d415f7ce599..ac3e418c2a96 100644 --- a/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_subprog_precision.c @@ -105,7 +105,7 @@ __msg("mark_precise: frame0: regs=r0 stack= before 4: (27) r0 *= 4") __msg("mark_precise: frame0: regs=r0 stack= before 3: (57) r0 &= 3") __msg("mark_precise: frame0: regs=r0 stack= before 10: (95) exit") __msg("mark_precise: frame1: regs=r0 stack= before 9: (bf) r0 = (s8)r10") -__msg("7: R0_w=scalar") +__msg("7: R0=scalar") __naked int fp_precise_subprog_result(void) { asm volatile ( @@ -141,7 +141,7 @@ __msg("mark_precise: frame1: regs=r0 stack= before 10: (bf) r0 = (s8)r1") * anyways, at which point we'll break precision chain */ __msg("mark_precise: frame1: regs=r1 stack= before 9: (bf) r1 = r10") -__msg("7: R0_w=scalar") +__msg("7: R0=scalar") __naked int sneaky_fp_precise_subprog_result(void) { asm volatile ( @@ -681,7 +681,7 @@ __msg("mark_precise: frame0: last_idx 10 first_idx 7 subseq_idx -1") __msg("mark_precise: frame0: regs=r7 stack= before 9: (bf) r1 = r8") __msg("mark_precise: frame0: regs=r7 stack= before 8: (27) r7 *= 4") __msg("mark_precise: frame0: regs=r7 stack= before 7: (79) r7 = *(u64 *)(r10 -8)") -__msg("mark_precise: frame0: parent 
state regs= stack=-8: R0_w=2 R6_w=1 R8_rw=map_value(map=.data.vals,ks=4,vs=16) R10=fp0 fp-8_rw=P1") +__msg("mark_precise: frame0: parent state regs= stack=-8: R0=2 R6=1 R8=map_value(map=.data.vals,ks=4,vs=16) R10=fp0 fp-8=P1") __msg("mark_precise: frame0: last_idx 18 first_idx 0 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-8 before 18: (95) exit") __msg("mark_precise: frame1: regs= stack= before 17: (0f) r0 += r2") diff --git a/tools/testing/selftests/bpf/verifier/bpf_st_mem.c b/tools/testing/selftests/bpf/verifier/bpf_st_mem.c index b616575c3b00..ce13002c7a19 100644 --- a/tools/testing/selftests/bpf/verifier/bpf_st_mem.c +++ b/tools/testing/selftests/bpf/verifier/bpf_st_mem.c @@ -93,7 +93,7 @@ .expected_attach_type = BPF_SK_LOOKUP, .result = VERBOSE_ACCEPT, .runs = -1, - .errstr = "0: (7a) *(u64 *)(r10 -8) = -44 ; R10=fp0 fp-8_w=-44\ + .errstr = "0: (7a) *(u64 *)(r10 -8) = -44 ; R10=fp0 fp-8=-44\ 2: (c5) if r0 s< 0x0 goto pc+2\ - R0_w=-44", + R0=-44", }, -- cgit v1.2.3 From 79f047c7d968b21ff4b72bd70c4533140553c56c Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Sep 2025 19:18:43 -0700 Subject: bpf: table based bpf_insn_successors() Converting bpf_insn_successors() to use a lookup table makes it ~1.5 times faster. Also remove unnecessary conditionals: - `idx + 1 < prog->len` is unnecessary because after check_cfg() all jump targets are guaranteed to be within the program; - `i == 0 || succ[0] != dst` is unnecessary because any client of bpf_insn_successors() can handle duplicate edges: - compute_live_registers() - compute_scc() Moving bpf_insn_successors() to liveness.c allows it to be inlined into liveness.c:__update_stack_liveness(). Such inlining speeds up __update_stack_liveness() by ~40%. bpf_insn_successors() is used in both verifier.c and liveness.c. perf shows this move does not negatively impact the users in verifier.c, as those run only once, before the main verification pass, unlike __update_stack_liveness(), which can be triggered multiple times.
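For illustration only — this sketch is not part of the patch, and the names are simplified stand-ins for the kernel types — the conversion turns the branchy can_fallthrough()/can_jump() classification into data, so computing successors becomes one table lookup plus at most two appends:

	#include <linux/bpf.h>	/* BPF_CLASS(), BPF_OP(), BPF_JMP, ... */

	struct opcode_info {
		bool can_jump;
		bool can_fallthrough;
	};

	/* One entry per opcode byte; the default covers every non-jump
	 * instruction, which only falls through to the next instruction.
	 */
	static const struct opcode_info tbl[256] = {
		[0 ... 255]          = { .can_jump = false, .can_fallthrough = true  },
		[BPF_JMP | BPF_EXIT] = { .can_jump = false, .can_fallthrough = false },
		[BPF_JMP | BPF_JA]   = { .can_jump = true,  .can_fallthrough = false },
		[BPF_JMP | BPF_JEQ]  = { .can_jump = true,  .can_fallthrough = true  },
		/* ... the remaining conditional jumps set both flags ... */
	};

	static int successors(u32 idx, u8 code, s32 jmp_off, u32 succ[2])
	{
		const struct opcode_info *op = &tbl[BPF_CLASS(code) | BPF_OP(code)];
		int n = 0;

		if (op->can_fallthrough)
			succ[n++] = idx + 1;	/* the real code adds 2 for ldimm64 */
		if (op->can_jump)
			succ[n++] = idx + jmp_off + 1;
		return n;
	}

The range designator ([0 ... 255]) followed by per-opcode overrides is what triggers -Woverride-init, hence the __diag_ignore_all() wrapped around the real table in the patch below.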
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250918-callchain-sensitive-liveness-v3-10-c3cd27bacc60@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/liveness.c | 56 ++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 72 +------------------------------------------- 3 files changed, 58 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c7515da8500c..4c497e839526 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -1049,6 +1049,7 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st u32 frameno); struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off); +int bpf_jmp_offset(struct bpf_insn *insn); int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]); void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); diff --git a/kernel/bpf/liveness.c b/kernel/bpf/liveness.c index 6f9dfaaf6e64..3c611aba7f52 100644 --- a/kernel/bpf/liveness.c +++ b/kernel/bpf/liveness.c @@ -433,6 +433,62 @@ static void log_mask_change(struct bpf_verifier_env *env, struct callchain *call bpf_log(&env->log, "\n"); } +int bpf_jmp_offset(struct bpf_insn *insn) +{ + u8 code = insn->code; + + if (code == (BPF_JMP32 | BPF_JA)) + return insn->imm; + return insn->off; +} + +__diag_push(); +__diag_ignore_all("-Woverride-init", "Allow field initialization overrides for opcode_info_tbl"); + +inline int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) +{ + static const struct opcode_info { + bool can_jump; + bool can_fallthrough; + } opcode_info_tbl[256] = { + [0 ... 255] = {.can_jump = false, .can_fallthrough = true}, + #define _J(code, ...) \ + [BPF_JMP | code] = __VA_ARGS__, \ + [BPF_JMP32 | code] = __VA_ARGS__ + + _J(BPF_EXIT, {.can_jump = false, .can_fallthrough = false}), + _J(BPF_JA, {.can_jump = true, .can_fallthrough = false}), + _J(BPF_JEQ, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JNE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JLT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JLE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JGT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JGE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSGT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSGE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSLT, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSLE, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JCOND, {.can_jump = true, .can_fallthrough = true}), + _J(BPF_JSET, {.can_jump = true, .can_fallthrough = true}), + #undef _J + }; + struct bpf_insn *insn = &prog->insnsi[idx]; + const struct opcode_info *opcode_info; + int i = 0, insn_sz; + + opcode_info = &opcode_info_tbl[BPF_CLASS(insn->code) | BPF_OP(insn->code)]; + insn_sz = bpf_is_ldimm64(insn) ? 
2 : 1; + if (opcode_info->can_fallthrough) + succ[i++] = idx + insn_sz; + + if (opcode_info->can_jump) + succ[i++] = idx + bpf_jmp_offset(insn) + 1; + + return i; +} + +__diag_pop(); + static struct func_instance *get_outer_instance(struct bpf_verifier_env *env, struct func_instance *instance) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e1da2471442b..1d4183bc3cd1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3485,15 +3485,6 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) return 0; } -static int jmp_offset(struct bpf_insn *insn) -{ - u8 code = insn->code; - - if (code == (BPF_JMP32 | BPF_JA)) - return insn->imm; - return insn->off; -} - static int check_subprogs(struct bpf_verifier_env *env) { int i, subprog_start, subprog_end, off, cur_subprog = 0; @@ -3520,7 +3511,7 @@ static int check_subprogs(struct bpf_verifier_env *env) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next; - off = i + jmp_offset(&insn[i]) + 1; + off = i + bpf_jmp_offset(&insn[i]) + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); return -EINVAL; @@ -23944,67 +23935,6 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, return 0; } -static bool can_fallthrough(struct bpf_insn *insn) -{ - u8 class = BPF_CLASS(insn->code); - u8 opcode = BPF_OP(insn->code); - - if (class != BPF_JMP && class != BPF_JMP32) - return true; - - if (opcode == BPF_EXIT || opcode == BPF_JA) - return false; - - return true; -} - -static bool can_jump(struct bpf_insn *insn) -{ - u8 class = BPF_CLASS(insn->code); - u8 opcode = BPF_OP(insn->code); - - if (class != BPF_JMP && class != BPF_JMP32) - return false; - - switch (opcode) { - case BPF_JA: - case BPF_JEQ: - case BPF_JNE: - case BPF_JLT: - case BPF_JLE: - case BPF_JGT: - case BPF_JGE: - case BPF_JSGT: - case BPF_JSGE: - case BPF_JSLT: - case BPF_JSLE: - case BPF_JCOND: - case BPF_JSET: - return true; - } - - return false; -} - -int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) -{ - struct bpf_insn *insn = &prog->insnsi[idx]; - int i = 0, insn_sz; - u32 dst; - - insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; - if (can_fallthrough(insn) && idx + 1 < prog->len) - succ[i++] = idx + insn_sz; - - if (can_jump(insn)) { - dst = idx + jmp_offset(insn) + 1; - if (i == 0 || succ[0] != dst) - succ[i++] = dst; - } - - return i; -} - /* Each field is a register bitmask */ struct insn_live_regs { u16 use; /* registers read by instruction */ -- cgit v1.2.3 From 32a8d2a197c1d2d36badb401657aa193938f071c Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 17 Sep 2025 16:12:01 +0100 Subject: net: stmmac: rework mac_interface and phy_interface documentation Based on new research, it has come to light that the comment that I added in a014c35556b9 ("net: stmmac: clarify difference between "interface" and "phy_interface"") is not fully correct. Update the comment to properly describe the difference between the two. All of the DTS files in the kernel tree do not mention the "mac-mode" property, which results in mac_interface and phy_interface being the same. Also, none of the platform glue drivers set mac_interface to anything but PHY_INTERFACE_MODE_NA. This means that for all the platforms known to mainline, mac_interface is either the same as phy_interface, or it is PHY_INTERFACE_MODE_NA. 
Thus, updating the definition for mac_interface in stmmac.h has no material effect on current uses known to mainline, but the change opens the door to cleaning up all uses. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1uytpB-00000006H23-0pRi@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index e284f04964bf..f14f34ec6d5e 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -190,18 +190,32 @@ struct plat_stmmacenet_data { int bus_id; int phy_addr; /* MAC ----- optional PCS ----- SerDes ----- optional PHY ----- Media - * ^ ^ - * mac_interface phy_interface + * ^ ^ + * mac_interface phy_interface * - * mac_interface is the MAC-side interface, which may be the same - * as phy_interface if there is no intervening PCS. If there is a - * PCS, then mac_interface describes the interface mode between the - * MAC and PCS, and phy_interface describes the interface mode - * between the PCS and PHY. + * The Synopsys dwmac core only covers the MAC and an optional + * integrated PCS. Where the integrated PCS is used with a SerDes, + * e.g. for 1000base-X or Cisco SGMII, the connection between the + * PCS and SerDes will be TBI. + * + * Where the Synopsys dwmac core has been instantiated with multiple + * interface modes, these are selected via core-external configuration + * which is sampled when the dwmac core is reset. How this is done is + * platform glue specific, but this defines the interface used from + * the Synopsys dwmac core to the rest of the SoC. + * + * Where PCS other than the optional integrated Synopsys dwmac PCS + * is used, this counts as "the rest of the SoC" in the above + * paragraph. + * + * Thus, mac_interface is of little use inside the stmmac code; + * please do not use unless there is a definite requirement, and + * make sure to gain review feedback first. */ phy_interface_t mac_interface; /* phy_interface is the PHY-side interface - the interface used by - * an attached PHY. + * an attached PHY or SFP etc. This is equivalent to the interface + * that phylink uses. */ phy_interface_t phy_interface; struct stmmac_mdio_bus_data *mdio_bus_data; -- cgit v1.2.3 From 6b0ed6a3a89cd2d04980e15a44c645bebb077418 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 17 Sep 2025 16:12:47 +0100 Subject: net: stmmac: remove mac_interface mac_interface has served little purpose, and has only caused confusion. Now that we have cleaned up all platform glue drivers which should not have been using mac_interface, there are no users remaining. Remove mac_interface. This results in the special dwmac-specific "mac-mode" DT property becoming redundant, and in any case, no DTS files in the kernel make use of this property. Add a warning if the property is set and it differs from the "phy-mode".
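As a hedged sketch of the end state (my_glue_uses_rgmii() is a made-up helper, not from this patch), consumers now key off plat->phy_interface alone:

	#include <linux/phy.h>
	#include <linux/stmmac.h>

	static bool my_glue_uses_rgmii(const struct plat_stmmacenet_data *plat)
	{
		/* no mac_interface fallback needed any more */
		return phy_interface_mode_is_rgmii(plat->phy_interface);
	}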
Signed-off-by: Russell King (Oracle) Acked-by: Vladimir Zapolskiy Link: https://patch.msgid.link/E1uytpv-00000006H2x-196h@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 2 -- drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c | 1 - drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 +---- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 6 +++++- include/linux/stmmac.h | 11 +++-------- 5 files changed, 9 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index dd82dc2189e9..592aa9d636e5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -98,8 +98,6 @@ static void loongson_default_data(struct pci_dev *pdev, /* Set default value for multicast hash bins */ plat->multicast_filter_bins = 256; - plat->mac_interface = PHY_INTERFACE_MODE_NA; - /* Set default value for unicast filter entries */ plat->unicast_filter_entries = 1; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c index c0c44916f849..2562a6d036a2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-lpc18xx.c @@ -41,7 +41,6 @@ static int lpc18xx_dwmac_probe(struct platform_device *pdev) if (IS_ERR(plat_dat)) return PTR_ERR(plat_dat); - plat_dat->mac_interface = PHY_INTERFACE_MODE_NA; plat_dat->has_gmac = true; reg = syscon_regmap_lookup_by_compatible("nxp,lpc1850-creg"); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index a23017a886f3..d17820d9e7f1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1118,10 +1118,7 @@ static const struct phylink_mac_ops stmmac_phylink_mac_ops = { */ static void stmmac_check_pcs_mode(struct stmmac_priv *priv) { - int interface = priv->plat->mac_interface; - - if (interface == PHY_INTERFACE_MODE_NA) - interface = priv->plat->phy_interface; + int interface = priv->plat->phy_interface; if (priv->dma_cap.pcs) { if ((interface == PHY_INTERFACE_MODE_RGMII) || diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index a3e077f225d1..712ef235f0f4 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -453,8 +453,12 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) return ERR_PTR(phy_mode); plat->phy_interface = phy_mode; + rc = stmmac_of_get_mac_mode(np); - plat->mac_interface = rc < 0 ? plat->phy_interface : rc; + if (rc >= 0 && rc != phy_mode) + dev_warn(&pdev->dev, + "\"mac-mode\" property used for %s but differs to \"phy-mode\" of %s, and will be ignored. Please report.\n", + phy_modes(rc), phy_modes(phy_mode)); /* Some wrapper drivers still rely on phy_node. Let's save it while * they are not converted to phylink. 
*/ diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index f14f34ec6d5e..fa1318bac06c 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -190,8 +190,8 @@ struct plat_stmmacenet_data { int bus_id; int phy_addr; /* MAC ----- optional PCS ----- SerDes ----- optional PHY ----- Media - * ^ ^ - * mac_interface phy_interface + * ^ + * phy_interface * * The Synopsys dwmac core only covers the MAC and an optional * integrated PCS. Where the integrated PCS is used with a SerDes, @@ -208,12 +208,7 @@ struct plat_stmmacenet_data { * is used, this counts as "the rest of the SoC" in the above * paragraph. * - * Thus, mac_interface is of little use inside the stmmac code; - * please do not use unless there is a definite requirement, and - * make sure to gain review feedback first. - */ - phy_interface_t mac_interface; - /* phy_interface is the PHY-side interface - the interface used by + * phy_interface is the PHY-side interface - the interface used by * an attached PHY or SFP etc. This is equivalent to the interface * that phylink uses. */ phy_interface_t phy_interface; -- cgit v1.2.3 From b34df17d588de926212527a2f2ce72bc4e330260 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 18 Sep 2025 05:25:57 -0700 Subject: net: netpoll: remove unused netpoll pointer from netpoll_info The netpoll_info structure contains a useless pointer back to its associated netpoll. This field is never used, and the assignment in __netpoll_setup() does not contemplate multiple instances, as reported by Jay[1]. Drop both the member and its initialization to simplify the structure. Link: https://lore.kernel.org/all/2930648.1757463506@famine/ [1] Signed-off-by: Breno Leitao Link: https://patch.msgid.link/20250918-netpoll_jv-v1-1-67d50eeb2c26@debian.org Signed-off-by: Jakub Kicinski --- include/linux/netpoll.h | 1 - net/core/netpoll.c | 1 - 2 files changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index b5ea9882eda8..f22eec466040 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -55,7 +55,6 @@ struct netpoll_info { struct delayed_work tx_work; - struct netpoll *netpoll; struct rcu_head rcu; }; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 5f65b62346d4..c58faa747165 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -591,7 +591,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) np->dev = ndev; strscpy(np->dev_name, ndev->name, IFNAMSIZ); - npinfo->netpoll = np; /* fill up the skb queue */ refill_skbs(np); -- cgit v1.2.3 From bee8a520eb84950193d0566ea2c2e46406a4b6ce Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 9 Sep 2025 17:50:56 +0800 Subject: rhashtable: Use rcu_dereference_all and rcu_dereference_all_check Add rcu_dereference_all and rcu_dereference_all_check so that library code such as rhashtable can be used with any RCU variant. As it stands rcu_dereference is used within rhashtable, which creates false-positive warnings if the user calls it from another RCU context, such as preempt_disable(). Use the rcu_dereference_all and rcu_dereference_all_check calls in rhashtable to suppress these warnings. Also replace the rcu_dereference_raw calls in the list iterators with rcu_dereference_all to uncover buggy calls. Reported-by: Menglong Dong Signed-off-by: Herbert Xu Reviewed-by: Paul E.
McKenney Signed-off-by: Herbert Xu --- include/linux/rcupdate.h | 26 ++++++++++++++++++++++++++ include/linux/rhashtable.h | 14 +++++++------- 2 files changed, 33 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 120536f4c6eb..448eb1f0cb48 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -713,6 +713,24 @@ do { \ (c) || rcu_read_lock_sched_held(), \ __rcu) +/** + * rcu_dereference_all_check() - rcu_dereference_all with debug checking + * @p: The pointer to read, prior to dereferencing + * @c: The conditions under which the dereference will take place + * + * This is similar to rcu_dereference_check(), but allows protection + * by all forms of vanilla RCU readers, including preemption disabled, + * bh-disabled, and interrupt-disabled regions of code. Note that "vanilla + * RCU" excludes SRCU and the various Tasks RCU flavors. Please note + * that this macro should not be backported to any Linux-kernel version + * preceding v5.0 due to changes in synchronize_rcu() semantics prior + * to that version. + */ +#define rcu_dereference_all_check(p, c) \ + __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ + (c) || rcu_read_lock_any_held(), \ + __rcu) + /* * The tracing infrastructure traces RCU (we want that), but unfortunately * some of the RCU checks causes tracing to lock up the system. @@ -767,6 +785,14 @@ do { \ */ #define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0) +/** + * rcu_dereference_all() - fetch RCU-all-protected pointer for dereferencing + * @p: The pointer to read, prior to dereferencing + * + * Makes rcu_dereference_check() do the dirty work. + */ +#define rcu_dereference_all(p) rcu_dereference_all_check(p, 0) + /** * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism * @p: The pointer to hand off diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index e740157f3cd7..05a221ce79a6 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -272,13 +272,13 @@ struct rhash_lock_head __rcu **rht_bucket_nested_insert( rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_rcu(p, ht) \ - rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht)) + rcu_dereference_all_check(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_bucket(p, tbl, hash) \ rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_dereference_bucket_rcu(p, tbl, hash) \ - rcu_dereference_check(p, lockdep_rht_bucket_is_held(tbl, hash)) + rcu_dereference_all_check(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_entry(tpos, pos, member) \ ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) @@ -373,7 +373,7 @@ static inline struct rhash_head *__rht_ptr( static inline struct rhash_head *rht_ptr_rcu( struct rhash_lock_head __rcu *const *bkt) { - return __rht_ptr(rcu_dereference(*bkt), bkt); + return __rht_ptr(rcu_dereference_all(*bkt), bkt); } static inline struct rhash_head *rht_ptr( @@ -497,7 +497,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, for (({barrier(); }), \ pos = head; \ !rht_is_a_nulls(pos); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_all(pos->next)) /** * rht_for_each_rcu - iterate over rcu hash chain @@ -513,7 +513,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, for (({barrier(); }), \ pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \ !rht_is_a_nulls(pos); \ - pos = rcu_dereference_raw(pos->next)) + pos = 
rcu_dereference_all(pos->next)) /** * rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head @@ -560,7 +560,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, * list returned by rhltable_lookup. */ #define rhl_for_each_rcu(pos, list) \ - for (pos = list; pos; pos = rcu_dereference_raw(pos->next)) + for (pos = list; pos; pos = rcu_dereference_all(pos->next)) /** * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type @@ -574,7 +574,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, */ #define rhl_for_each_entry_rcu(tpos, pos, list, member) \ for (pos = list; pos && rht_entry(tpos, pos, member); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_all(pos->next)) static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, const void *obj) -- cgit v1.2.3 From 3d716c51e0e8791f8dd72479a3e6d5e7650ac35e Mon Sep 17 00:00:00 2001 From: Weili Qian Date: Sat, 13 Sep 2025 18:57:51 +0800 Subject: crypto: hisilicon/qm - mask axi error before memory init After the device memory is cleared, if software sends a doorbell operation, the hardware may trigger an axi error when processing the doorbell. This error is caused by memory clearing and hardware access to address 0. Therefore, the axi error is masked during this period. Signed-off-by: Weili Qian Signed-off-by: Chenghai Huang Signed-off-by: Herbert Xu --- drivers/crypto/hisilicon/hpre/hpre_main.c | 100 +++++++++++++++++++---------- drivers/crypto/hisilicon/qm.c | 66 +++++++++++++------ drivers/crypto/hisilicon/sec2/sec_main.c | 90 ++++++++++++++++++-------- drivers/crypto/hisilicon/zip/zip_main.c | 102 ++++++++++++++++++++---------- include/linux/hisi_acc_qm.h | 21 ++++-- 5 files changed, 257 insertions(+), 122 deletions(-) (limited to 'include/linux') diff --git a/drivers/crypto/hisilicon/hpre/hpre_main.c b/drivers/crypto/hisilicon/hpre/hpre_main.c index f437f361a2c9..718abe3fa5fe 100644 --- a/drivers/crypto/hisilicon/hpre/hpre_main.c +++ b/drivers/crypto/hisilicon/hpre/hpre_main.c @@ -39,6 +39,7 @@ #define HPRE_HAC_RAS_NFE_ENB 0x301414 #define HPRE_HAC_RAS_FE_ENB 0x301418 #define HPRE_HAC_INT_SET 0x301500 +#define HPRE_AXI_ERROR_MASK GENMASK(21, 10) #define HPRE_RNG_TIMEOUT_NUM 0x301A34 #define HPRE_CORE_INT_ENABLE 0 #define HPRE_RDCHN_INI_ST 0x301a00 @@ -798,8 +799,7 @@ static void hpre_master_ooo_ctrl(struct hisi_qm *qm, bool enable) val1 = readl(qm->io_base + HPRE_AM_OOO_SHUTDOWN_ENB); if (enable) { val1 |= HPRE_AM_OOO_SHUTDOWN_ENABLE; - val2 = hisi_qm_get_hw_info(qm, hpre_basic_info, - HPRE_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + val2 = qm->err_info.dev_err.shutdown_mask; } else { val1 &= ~HPRE_AM_OOO_SHUTDOWN_ENABLE; val2 = 0x0; @@ -813,38 +813,33 @@ static void hpre_master_ooo_ctrl(struct hisi_qm *qm, bool enable) static void hpre_hw_error_disable(struct hisi_qm *qm) { - u32 ce, nfe; - - ce = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_CE_MASK_CAP, qm->cap_ver); - nfe = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_NFE_MASK_CAP, qm->cap_ver); + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; /* disable hpre hw error interrupts */ - writel(ce | nfe | HPRE_HAC_RAS_FE_ENABLE, qm->io_base + HPRE_INT_MASK); + writel(err_mask, qm->io_base + HPRE_INT_MASK); /* disable HPRE block master OOO when nfe occurs on Kunpeng930 */ hpre_master_ooo_ctrl(qm, false); } static void hpre_hw_error_enable(struct hisi_qm *qm) { - u32 ce, nfe, err_en; - - ce = hisi_qm_get_hw_info(qm, hpre_basic_info, 
HPRE_CE_MASK_CAP, qm->cap_ver); - nfe = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_NFE_MASK_CAP, qm->cap_ver); + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; /* clear HPRE hw error source if having */ - writel(ce | nfe | HPRE_HAC_RAS_FE_ENABLE, qm->io_base + HPRE_HAC_SOURCE_INT); + writel(err_mask, qm->io_base + HPRE_HAC_SOURCE_INT); /* configure error type */ - writel(ce, qm->io_base + HPRE_RAS_CE_ENB); - writel(nfe, qm->io_base + HPRE_RAS_NFE_ENB); - writel(HPRE_HAC_RAS_FE_ENABLE, qm->io_base + HPRE_RAS_FE_ENB); + writel(dev_err->ce, qm->io_base + HPRE_RAS_CE_ENB); + writel(dev_err->nfe, qm->io_base + HPRE_RAS_NFE_ENB); + writel(dev_err->fe, qm->io_base + HPRE_RAS_FE_ENB); /* enable HPRE block master OOO when nfe occurs on Kunpeng930 */ hpre_master_ooo_ctrl(qm, true); /* enable hpre hw error interrupts */ - err_en = ce | nfe | HPRE_HAC_RAS_FE_ENABLE; - writel(~err_en, qm->io_base + HPRE_INT_MASK); + writel(~err_mask, qm->io_base + HPRE_INT_MASK); } static inline struct hisi_qm *hpre_file_to_qm(struct hpre_debugfs_file *file) @@ -1399,9 +1394,8 @@ static void hpre_clear_hw_err_status(struct hisi_qm *qm, u32 err_sts) static void hpre_disable_error_report(struct hisi_qm *qm, u32 err_type) { - u32 nfe_mask; + u32 nfe_mask = qm->err_info.dev_err.nfe; - nfe_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_NFE_MASK_CAP, qm->cap_ver); writel(nfe_mask & (~err_type), qm->io_base + HPRE_RAS_NFE_ENB); } @@ -1422,11 +1416,11 @@ static enum acc_err_result hpre_get_err_result(struct hisi_qm *qm) err_status = hpre_get_hw_err_status(qm); if (err_status) { - if (err_status & qm->err_info.ecc_2bits_mask) + if (err_status & qm->err_info.dev_err.ecc_2bits_mask) qm->err_status.is_dev_ecc_mbit = true; hpre_log_hw_error(qm, err_status); - if (err_status & qm->err_info.dev_reset_mask) { + if (err_status & qm->err_info.dev_err.reset_mask) { /* Disable the same error reporting until device is recovered. 
*/ hpre_disable_error_report(qm, err_status); return ACC_ERR_NEED_RESET; @@ -1442,28 +1436,64 @@ static bool hpre_dev_is_abnormal(struct hisi_qm *qm) u32 err_status; err_status = hpre_get_hw_err_status(qm); - if (err_status & qm->err_info.dev_shutdown_mask) + if (err_status & qm->err_info.dev_err.shutdown_mask) return true; return false; } +static void hpre_disable_axi_error(struct hisi_qm *qm) +{ + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; + u32 val; + + val = ~(err_mask & (~HPRE_AXI_ERROR_MASK)); + writel(val, qm->io_base + HPRE_INT_MASK); + + if (qm->ver > QM_HW_V2) + writel(dev_err->shutdown_mask & (~HPRE_AXI_ERROR_MASK), + qm->io_base + HPRE_OOO_SHUTDOWN_SEL); +} + +static void hpre_enable_axi_error(struct hisi_qm *qm) +{ + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; + + /* clear axi error source */ + writel(HPRE_AXI_ERROR_MASK, qm->io_base + HPRE_HAC_SOURCE_INT); + + writel(~err_mask, qm->io_base + HPRE_INT_MASK); + + if (qm->ver > QM_HW_V2) + writel(dev_err->shutdown_mask, qm->io_base + HPRE_OOO_SHUTDOWN_SEL); +} + static void hpre_err_info_init(struct hisi_qm *qm) { struct hisi_qm_err_info *err_info = &qm->err_info; + struct hisi_qm_err_mask *qm_err = &err_info->qm_err; + struct hisi_qm_err_mask *dev_err = &err_info->dev_err; + + qm_err->fe = HPRE_HAC_RAS_FE_ENABLE; + qm_err->ce = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_QM_CE_MASK_CAP, qm->cap_ver); + qm_err->nfe = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_QM_NFE_MASK_CAP, qm->cap_ver); + qm_err->shutdown_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, + HPRE_QM_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + qm_err->reset_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, + HPRE_QM_RESET_MASK_CAP, qm->cap_ver); + qm_err->ecc_2bits_mask = QM_ECC_MBIT; + + dev_err->fe = HPRE_HAC_RAS_FE_ENABLE; + dev_err->ce = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_CE_MASK_CAP, qm->cap_ver); + dev_err->nfe = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_NFE_MASK_CAP, qm->cap_ver); + dev_err->shutdown_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, + HPRE_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + dev_err->reset_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, + HPRE_RESET_MASK_CAP, qm->cap_ver); + dev_err->ecc_2bits_mask = HPRE_CORE_ECC_2BIT_ERR | HPRE_OOO_ECC_2BIT_ERR; - err_info->fe = HPRE_HAC_RAS_FE_ENABLE; - err_info->ce = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_QM_CE_MASK_CAP, qm->cap_ver); - err_info->nfe = hisi_qm_get_hw_info(qm, hpre_basic_info, HPRE_QM_NFE_MASK_CAP, qm->cap_ver); - err_info->ecc_2bits_mask = HPRE_CORE_ECC_2BIT_ERR | HPRE_OOO_ECC_2BIT_ERR; - err_info->dev_shutdown_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, - HPRE_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); - err_info->qm_shutdown_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, - HPRE_QM_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); - err_info->qm_reset_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, - HPRE_QM_RESET_MASK_CAP, qm->cap_ver); - err_info->dev_reset_mask = hisi_qm_get_hw_info(qm, hpre_basic_info, - HPRE_RESET_MASK_CAP, qm->cap_ver); err_info->msi_wr_port = HPRE_WR_MSI_PORT; err_info->acpi_rst = "HRST"; } @@ -1481,6 +1511,8 @@ static const struct hisi_qm_err_ini hpre_err_ini = { .err_info_init = hpre_err_info_init, .get_err_result = hpre_get_err_result, .dev_is_abnormal = hpre_dev_is_abnormal, + .disable_axi_error = hpre_disable_axi_error, + .enable_axi_error = hpre_enable_axi_error, }; static int hpre_pf_probe_init(struct 
hpre *hpre) diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c index c1ffaae78532..d0d1fc45b8fe 100644 --- a/drivers/crypto/hisilicon/qm.c +++ b/drivers/crypto/hisilicon/qm.c @@ -147,9 +147,9 @@ #define QM_RAS_CE_TIMES_PER_IRQ 1 #define QM_OOO_SHUTDOWN_SEL 0x1040f8 #define QM_AXI_RRESP_ERR BIT(0) -#define QM_ECC_MBIT BIT(2) #define QM_DB_TIMEOUT BIT(10) #define QM_OF_FIFO_OF BIT(11) +#define QM_RAS_AXI_ERROR (BIT(0) | BIT(1) | BIT(12)) #define QM_RESET_WAIT_TIMEOUT 400 #define QM_PEH_VENDOR_ID 0x1000d8 @@ -165,7 +165,6 @@ #define ACC_MASTER_TRANS_RETURN 0x300150 #define ACC_MASTER_GLOBAL_CTRL 0x300000 #define ACC_AM_CFG_PORT_WR_EN 0x30001c -#define QM_RAS_NFE_MBIT_DISABLE ~QM_ECC_MBIT #define ACC_AM_ROB_ECC_INT_STS 0x300104 #define ACC_ROB_ECC_ERR_MULTPL BIT(1) #define QM_MSI_CAP_ENABLE BIT(16) @@ -522,7 +521,7 @@ static bool qm_check_dev_error(struct hisi_qm *qm) return false; err_status = qm_get_hw_error_status(pf_qm); - if (err_status & pf_qm->err_info.qm_shutdown_mask) + if (err_status & pf_qm->err_info.qm_err.shutdown_mask) return true; if (pf_qm->err_ini->dev_is_abnormal) @@ -1397,17 +1396,17 @@ static void qm_hw_error_init_v1(struct hisi_qm *qm) static void qm_hw_error_cfg(struct hisi_qm *qm) { - struct hisi_qm_err_info *err_info = &qm->err_info; + struct hisi_qm_err_mask *qm_err = &qm->err_info.qm_err; - qm->error_mask = err_info->nfe | err_info->ce | err_info->fe; + qm->error_mask = qm_err->nfe | qm_err->ce | qm_err->fe; /* clear QM hw residual error source */ writel(qm->error_mask, qm->io_base + QM_ABNORMAL_INT_SOURCE); /* configure error type */ - writel(err_info->ce, qm->io_base + QM_RAS_CE_ENABLE); + writel(qm_err->ce, qm->io_base + QM_RAS_CE_ENABLE); writel(QM_RAS_CE_TIMES_PER_IRQ, qm->io_base + QM_RAS_CE_THRESHOLD); - writel(err_info->nfe, qm->io_base + QM_RAS_NFE_ENABLE); - writel(err_info->fe, qm->io_base + QM_RAS_FE_ENABLE); + writel(qm_err->nfe, qm->io_base + QM_RAS_NFE_ENABLE); + writel(qm_err->fe, qm->io_base + QM_RAS_FE_ENABLE); } static void qm_hw_error_init_v2(struct hisi_qm *qm) @@ -1436,7 +1435,7 @@ static void qm_hw_error_init_v3(struct hisi_qm *qm) qm_hw_error_cfg(qm); /* enable close master ooo when hardware error happened */ - writel(qm->err_info.qm_shutdown_mask, qm->io_base + QM_OOO_SHUTDOWN_SEL); + writel(qm->err_info.qm_err.shutdown_mask, qm->io_base + QM_OOO_SHUTDOWN_SEL); irq_unmask = ~qm->error_mask; irq_unmask &= readl(qm->io_base + QM_ABNORMAL_INT_MASK); @@ -1498,6 +1497,7 @@ static void qm_log_hw_error(struct hisi_qm *qm, u32 error_status) static enum acc_err_result qm_hw_error_handle_v2(struct hisi_qm *qm) { + struct hisi_qm_err_mask *qm_err = &qm->err_info.qm_err; u32 error_status; error_status = qm_get_hw_error_status(qm); @@ -1506,17 +1506,16 @@ static enum acc_err_result qm_hw_error_handle_v2(struct hisi_qm *qm) qm->err_status.is_qm_ecc_mbit = true; qm_log_hw_error(qm, error_status); - if (error_status & qm->err_info.qm_reset_mask) { + if (error_status & qm_err->reset_mask) { /* Disable the same error reporting until device is recovered. */ - writel(qm->err_info.nfe & (~error_status), - qm->io_base + QM_RAS_NFE_ENABLE); + writel(qm_err->nfe & (~error_status), qm->io_base + QM_RAS_NFE_ENABLE); return ACC_ERR_NEED_RESET; } /* Clear error source if not need reset. 
*/ writel(error_status, qm->io_base + QM_ABNORMAL_INT_SOURCE); - writel(qm->err_info.nfe, qm->io_base + QM_RAS_NFE_ENABLE); - writel(qm->err_info.ce, qm->io_base + QM_RAS_CE_ENABLE); + writel(qm_err->nfe, qm->io_base + QM_RAS_NFE_ENABLE); + writel(qm_err->ce, qm->io_base + QM_RAS_CE_ENABLE); } return ACC_ERR_RECOVERED; @@ -4227,9 +4226,9 @@ static void qm_dev_ecc_mbit_handle(struct hisi_qm *qm) !qm->err_status.is_qm_ecc_mbit && !qm->err_ini->close_axi_master_ooo) { nfe_enb = readl(qm->io_base + QM_RAS_NFE_ENABLE); - writel(nfe_enb & QM_RAS_NFE_MBIT_DISABLE, + writel(nfe_enb & ~qm->err_info.qm_err.ecc_2bits_mask, qm->io_base + QM_RAS_NFE_ENABLE); - writel(QM_ECC_MBIT, qm->io_base + QM_ABNORMAL_INT_SET); + writel(qm->err_info.qm_err.ecc_2bits_mask, qm->io_base + QM_ABNORMAL_INT_SET); } } @@ -4508,12 +4507,12 @@ static void qm_restart_prepare(struct hisi_qm *qm) qm->io_base + ACC_AM_CFG_PORT_WR_EN); /* clear dev ecc 2bit error source if having */ - value = qm_get_dev_err_status(qm) & qm->err_info.ecc_2bits_mask; + value = qm_get_dev_err_status(qm) & qm->err_info.dev_err.ecc_2bits_mask; if (value && qm->err_ini->clear_dev_hw_err_status) qm->err_ini->clear_dev_hw_err_status(qm, value); /* clear QM ecc mbit error source */ - writel(QM_ECC_MBIT, qm->io_base + QM_ABNORMAL_INT_SOURCE); + writel(qm->err_info.qm_err.ecc_2bits_mask, qm->io_base + QM_ABNORMAL_INT_SOURCE); /* clear AM Reorder Buffer ecc mbit source */ writel(ACC_ROB_ECC_ERR_MULTPL, qm->io_base + ACC_AM_ROB_ECC_INT_STS); @@ -4540,6 +4539,34 @@ clear_flags: qm->err_status.is_dev_ecc_mbit = false; } +static void qm_disable_axi_error(struct hisi_qm *qm) +{ + struct hisi_qm_err_mask *qm_err = &qm->err_info.qm_err; + u32 val; + + val = ~(qm->error_mask & (~QM_RAS_AXI_ERROR)); + writel(val, qm->io_base + QM_ABNORMAL_INT_MASK); + if (qm->ver > QM_HW_V2) + writel(qm_err->shutdown_mask & (~QM_RAS_AXI_ERROR), + qm->io_base + QM_OOO_SHUTDOWN_SEL); + + if (qm->err_ini->disable_axi_error) + qm->err_ini->disable_axi_error(qm); +} + +static void qm_enable_axi_error(struct hisi_qm *qm) +{ + /* clear axi error source */ + writel(QM_RAS_AXI_ERROR, qm->io_base + QM_ABNORMAL_INT_SOURCE); + + writel(~qm->error_mask, qm->io_base + QM_ABNORMAL_INT_MASK); + if (qm->ver > QM_HW_V2) + writel(qm->err_info.qm_err.shutdown_mask, qm->io_base + QM_OOO_SHUTDOWN_SEL); + + if (qm->err_ini->enable_axi_error) + qm->err_ini->enable_axi_error(qm); +} + static int qm_controller_reset_done(struct hisi_qm *qm) { struct pci_dev *pdev = qm->pdev; @@ -4573,6 +4600,7 @@ static int qm_controller_reset_done(struct hisi_qm *qm) qm_restart_prepare(qm); hisi_qm_dev_err_init(qm); + qm_disable_axi_error(qm); if (qm->err_ini->open_axi_master_ooo) qm->err_ini->open_axi_master_ooo(qm); @@ -4595,7 +4623,7 @@ static int qm_controller_reset_done(struct hisi_qm *qm) ret = qm_wait_vf_prepare_finish(qm); if (ret) pci_err(pdev, "failed to start by vfs in soft reset!\n"); - + qm_enable_axi_error(qm); qm_cmd_init(qm); qm_restart_done(qm); diff --git a/drivers/crypto/hisilicon/sec2/sec_main.c b/drivers/crypto/hisilicon/sec2/sec_main.c index bdb2d52ee1b6..19fda486fefb 100644 --- a/drivers/crypto/hisilicon/sec2/sec_main.c +++ b/drivers/crypto/hisilicon/sec2/sec_main.c @@ -47,6 +47,8 @@ #define SEC_RAS_FE_ENB_MSK 0x0 #define SEC_OOO_SHUTDOWN_SEL 0x301014 #define SEC_RAS_DISABLE 0x0 +#define SEC_AXI_ERROR_MASK (BIT(0) | BIT(1)) + #define SEC_MEM_START_INIT_REG 0x301100 #define SEC_MEM_INIT_DONE_REG 0x301104 @@ -713,8 +715,7 @@ static void sec_master_ooo_ctrl(struct hisi_qm *qm, bool enable) val1 = 
readl(qm->io_base + SEC_CONTROL_REG); if (enable) { val1 |= SEC_AXI_SHUTDOWN_ENABLE; - val2 = hisi_qm_get_hw_info(qm, sec_basic_info, - SEC_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + val2 = qm->err_info.dev_err.shutdown_mask; } else { val1 &= SEC_AXI_SHUTDOWN_DISABLE; val2 = 0x0; @@ -728,7 +729,8 @@ static void sec_master_ooo_ctrl(struct hisi_qm *qm, bool enable) static void sec_hw_error_enable(struct hisi_qm *qm) { - u32 ce, nfe; + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; if (qm->ver == QM_HW_V1) { writel(SEC_CORE_INT_DISABLE, qm->io_base + SEC_CORE_INT_MASK); @@ -736,22 +738,19 @@ static void sec_hw_error_enable(struct hisi_qm *qm) return; } - ce = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_CE_MASK_CAP, qm->cap_ver); - nfe = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_NFE_MASK_CAP, qm->cap_ver); - /* clear SEC hw error source if having */ - writel(ce | nfe | SEC_RAS_FE_ENB_MSK, qm->io_base + SEC_CORE_INT_SOURCE); + writel(err_mask, qm->io_base + SEC_CORE_INT_SOURCE); /* enable RAS int */ - writel(ce, qm->io_base + SEC_RAS_CE_REG); - writel(SEC_RAS_FE_ENB_MSK, qm->io_base + SEC_RAS_FE_REG); - writel(nfe, qm->io_base + SEC_RAS_NFE_REG); + writel(dev_err->ce, qm->io_base + SEC_RAS_CE_REG); + writel(dev_err->fe, qm->io_base + SEC_RAS_FE_REG); + writel(dev_err->nfe, qm->io_base + SEC_RAS_NFE_REG); /* enable SEC block master OOO when nfe occurs on Kunpeng930 */ sec_master_ooo_ctrl(qm, true); /* enable SEC hw error interrupts */ - writel(ce | nfe | SEC_RAS_FE_ENB_MSK, qm->io_base + SEC_CORE_INT_MASK); + writel(err_mask, qm->io_base + SEC_CORE_INT_MASK); } static void sec_hw_error_disable(struct hisi_qm *qm) @@ -1108,9 +1107,8 @@ static void sec_clear_hw_err_status(struct hisi_qm *qm, u32 err_sts) static void sec_disable_error_report(struct hisi_qm *qm, u32 err_type) { - u32 nfe_mask; + u32 nfe_mask = qm->err_info.dev_err.nfe; - nfe_mask = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_NFE_MASK_CAP, qm->cap_ver); writel(nfe_mask & (~err_type), qm->io_base + SEC_RAS_NFE_REG); } @@ -1129,11 +1127,11 @@ static enum acc_err_result sec_get_err_result(struct hisi_qm *qm) err_status = sec_get_hw_err_status(qm); if (err_status) { - if (err_status & qm->err_info.ecc_2bits_mask) + if (err_status & qm->err_info.dev_err.ecc_2bits_mask) qm->err_status.is_dev_ecc_mbit = true; sec_log_hw_error(qm, err_status); - if (err_status & qm->err_info.dev_reset_mask) { + if (err_status & qm->err_info.dev_err.reset_mask) { /* Disable the same error reporting until device is recovered. 
*/ sec_disable_error_report(qm, err_status); return ACC_ERR_NEED_RESET; @@ -1149,28 +1147,62 @@ static bool sec_dev_is_abnormal(struct hisi_qm *qm) u32 err_status; err_status = sec_get_hw_err_status(qm); - if (err_status & qm->err_info.dev_shutdown_mask) + if (err_status & qm->err_info.dev_err.shutdown_mask) return true; return false; } +static void sec_disable_axi_error(struct hisi_qm *qm) +{ + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; + + writel(err_mask & ~SEC_AXI_ERROR_MASK, qm->io_base + SEC_CORE_INT_MASK); + + if (qm->ver > QM_HW_V2) + writel(dev_err->shutdown_mask & (~SEC_AXI_ERROR_MASK), + qm->io_base + SEC_OOO_SHUTDOWN_SEL); +} + +static void sec_enable_axi_error(struct hisi_qm *qm) +{ + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; + + /* clear axi error source */ + writel(SEC_AXI_ERROR_MASK, qm->io_base + SEC_CORE_INT_SOURCE); + + writel(err_mask, qm->io_base + SEC_CORE_INT_MASK); + + if (qm->ver > QM_HW_V2) + writel(dev_err->shutdown_mask, qm->io_base + SEC_OOO_SHUTDOWN_SEL); +} + static void sec_err_info_init(struct hisi_qm *qm) { struct hisi_qm_err_info *err_info = &qm->err_info; + struct hisi_qm_err_mask *qm_err = &err_info->qm_err; + struct hisi_qm_err_mask *dev_err = &err_info->dev_err; + + qm_err->fe = SEC_RAS_FE_ENB_MSK; + qm_err->ce = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_QM_CE_MASK_CAP, qm->cap_ver); + qm_err->nfe = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_QM_NFE_MASK_CAP, qm->cap_ver); + qm_err->shutdown_mask = hisi_qm_get_hw_info(qm, sec_basic_info, + SEC_QM_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + qm_err->reset_mask = hisi_qm_get_hw_info(qm, sec_basic_info, + SEC_QM_RESET_MASK_CAP, qm->cap_ver); + qm_err->ecc_2bits_mask = QM_ECC_MBIT; + + dev_err->fe = SEC_RAS_FE_ENB_MSK; + dev_err->ce = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_CE_MASK_CAP, qm->cap_ver); + dev_err->nfe = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_NFE_MASK_CAP, qm->cap_ver); + dev_err->shutdown_mask = hisi_qm_get_hw_info(qm, sec_basic_info, + SEC_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + dev_err->reset_mask = hisi_qm_get_hw_info(qm, sec_basic_info, + SEC_RESET_MASK_CAP, qm->cap_ver); + dev_err->ecc_2bits_mask = SEC_CORE_INT_STATUS_M_ECC; - err_info->fe = SEC_RAS_FE_ENB_MSK; - err_info->ce = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_QM_CE_MASK_CAP, qm->cap_ver); - err_info->nfe = hisi_qm_get_hw_info(qm, sec_basic_info, SEC_QM_NFE_MASK_CAP, qm->cap_ver); - err_info->ecc_2bits_mask = SEC_CORE_INT_STATUS_M_ECC; - err_info->qm_shutdown_mask = hisi_qm_get_hw_info(qm, sec_basic_info, - SEC_QM_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); - err_info->dev_shutdown_mask = hisi_qm_get_hw_info(qm, sec_basic_info, - SEC_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); - err_info->qm_reset_mask = hisi_qm_get_hw_info(qm, sec_basic_info, - SEC_QM_RESET_MASK_CAP, qm->cap_ver); - err_info->dev_reset_mask = hisi_qm_get_hw_info(qm, sec_basic_info, - SEC_RESET_MASK_CAP, qm->cap_ver); err_info->msi_wr_port = BIT(0); err_info->acpi_rst = "SRST"; } @@ -1188,6 +1220,8 @@ static const struct hisi_qm_err_ini sec_err_ini = { .err_info_init = sec_err_info_init, .get_err_result = sec_get_err_result, .dev_is_abnormal = sec_dev_is_abnormal, + .disable_axi_error = sec_disable_axi_error, + .enable_axi_error = sec_enable_axi_error, }; static int sec_pf_probe_init(struct sec_dev *sec) diff --git a/drivers/crypto/hisilicon/zip/zip_main.c b/drivers/crypto/hisilicon/zip/zip_main.c index 
62cd090e13af..6b5cad82c856 100644 --- a/drivers/crypto/hisilicon/zip/zip_main.c +++ b/drivers/crypto/hisilicon/zip/zip_main.c @@ -65,6 +65,7 @@ #define HZIP_SRAM_ECC_ERR_NUM_SHIFT 16 #define HZIP_SRAM_ECC_ERR_ADDR_SHIFT 24 #define HZIP_CORE_INT_MASK_ALL GENMASK(12, 0) +#define HZIP_AXI_ERROR_MASK (BIT(2) | BIT(3)) #define HZIP_SQE_SIZE 128 #define HZIP_PF_DEF_Q_NUM 64 #define HZIP_PF_DEF_Q_BASE 0 @@ -662,8 +663,7 @@ static void hisi_zip_master_ooo_ctrl(struct hisi_qm *qm, bool enable) val1 = readl(qm->io_base + HZIP_SOFT_CTRL_ZIP_CONTROL); if (enable) { val1 |= HZIP_AXI_SHUTDOWN_ENABLE; - val2 = hisi_qm_get_hw_info(qm, zip_basic_cap_info, - ZIP_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + val2 = qm->err_info.dev_err.shutdown_mask; } else { val1 &= ~HZIP_AXI_SHUTDOWN_ENABLE; val2 = 0x0; @@ -677,7 +677,8 @@ static void hisi_zip_master_ooo_ctrl(struct hisi_qm *qm, bool enable) static void hisi_zip_hw_error_enable(struct hisi_qm *qm) { - u32 nfe, ce; + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; if (qm->ver == QM_HW_V1) { writel(HZIP_CORE_INT_MASK_ALL, @@ -686,33 +687,29 @@ static void hisi_zip_hw_error_enable(struct hisi_qm *qm) return; } - nfe = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_NFE_MASK_CAP, qm->cap_ver); - ce = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_CE_MASK_CAP, qm->cap_ver); - /* clear ZIP hw error source if having */ - writel(ce | nfe | HZIP_CORE_INT_RAS_FE_ENB_MASK, qm->io_base + HZIP_CORE_INT_SOURCE); + writel(err_mask, qm->io_base + HZIP_CORE_INT_SOURCE); /* configure error type */ - writel(ce, qm->io_base + HZIP_CORE_INT_RAS_CE_ENB); - writel(HZIP_CORE_INT_RAS_FE_ENB_MASK, qm->io_base + HZIP_CORE_INT_RAS_FE_ENB); - writel(nfe, qm->io_base + HZIP_CORE_INT_RAS_NFE_ENB); + writel(dev_err->ce, qm->io_base + HZIP_CORE_INT_RAS_CE_ENB); + writel(dev_err->fe, qm->io_base + HZIP_CORE_INT_RAS_FE_ENB); + writel(dev_err->nfe, qm->io_base + HZIP_CORE_INT_RAS_NFE_ENB); hisi_zip_master_ooo_ctrl(qm, true); /* enable ZIP hw error interrupts */ - writel(0, qm->io_base + HZIP_CORE_INT_MASK_REG); + writel(~err_mask, qm->io_base + HZIP_CORE_INT_MASK_REG); hisi_dae_hw_error_enable(qm); } static void hisi_zip_hw_error_disable(struct hisi_qm *qm) { - u32 nfe, ce; + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; /* disable ZIP hw error interrupts */ - nfe = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_NFE_MASK_CAP, qm->cap_ver); - ce = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_CE_MASK_CAP, qm->cap_ver); - writel(ce | nfe | HZIP_CORE_INT_RAS_FE_ENB_MASK, qm->io_base + HZIP_CORE_INT_MASK_REG); + writel(err_mask, qm->io_base + HZIP_CORE_INT_MASK_REG); hisi_zip_master_ooo_ctrl(qm, false); @@ -1186,9 +1183,8 @@ static void hisi_zip_clear_hw_err_status(struct hisi_qm *qm, u32 err_sts) static void hisi_zip_disable_error_report(struct hisi_qm *qm, u32 err_type) { - u32 nfe_mask; + u32 nfe_mask = qm->err_info.dev_err.nfe; - nfe_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_NFE_MASK_CAP, qm->cap_ver); writel(nfe_mask & (~err_type), qm->io_base + HZIP_CORE_INT_RAS_NFE_ENB); } @@ -1230,14 +1226,14 @@ static enum acc_err_result hisi_zip_get_err_result(struct hisi_qm *qm) /* Get device hardware new error status */ err_status = hisi_zip_get_hw_err_status(qm); if (err_status) { - if (err_status & qm->err_info.ecc_2bits_mask) + if (err_status & qm->err_info.dev_err.ecc_2bits_mask) qm->err_status.is_dev_ecc_mbit = true; hisi_zip_log_hw_error(qm, 
err_status); - if (err_status & qm->err_info.dev_reset_mask) { + if (err_status & qm->err_info.dev_err.reset_mask) { /* Disable the same error reporting until device is recovered. */ hisi_zip_disable_error_report(qm, err_status); - return ACC_ERR_NEED_RESET; + zip_result = ACC_ERR_NEED_RESET; } else { hisi_zip_clear_hw_err_status(qm, err_status); } @@ -1255,7 +1251,7 @@ static bool hisi_zip_dev_is_abnormal(struct hisi_qm *qm) u32 err_status; err_status = hisi_zip_get_hw_err_status(qm); - if (err_status & qm->err_info.dev_shutdown_mask) + if (err_status & qm->err_info.dev_err.shutdown_mask) return true; return hisi_dae_dev_is_abnormal(qm); @@ -1266,23 +1262,59 @@ static int hisi_zip_set_priv_status(struct hisi_qm *qm) return hisi_dae_close_axi_master_ooo(qm); } +static void hisi_zip_disable_axi_error(struct hisi_qm *qm) +{ + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; + u32 val; + + val = ~(err_mask & (~HZIP_AXI_ERROR_MASK)); + writel(val, qm->io_base + HZIP_CORE_INT_MASK_REG); + + if (qm->ver > QM_HW_V2) + writel(dev_err->shutdown_mask & (~HZIP_AXI_ERROR_MASK), + qm->io_base + HZIP_OOO_SHUTDOWN_SEL); +} + +static void hisi_zip_enable_axi_error(struct hisi_qm *qm) +{ + struct hisi_qm_err_mask *dev_err = &qm->err_info.dev_err; + u32 err_mask = dev_err->ce | dev_err->nfe | dev_err->fe; + + /* clear axi error source */ + writel(HZIP_AXI_ERROR_MASK, qm->io_base + HZIP_CORE_INT_SOURCE); + + writel(~err_mask, qm->io_base + HZIP_CORE_INT_MASK_REG); + + if (qm->ver > QM_HW_V2) + writel(dev_err->shutdown_mask, qm->io_base + HZIP_OOO_SHUTDOWN_SEL); +} + static void hisi_zip_err_info_init(struct hisi_qm *qm) { struct hisi_qm_err_info *err_info = &qm->err_info; + struct hisi_qm_err_mask *qm_err = &err_info->qm_err; + struct hisi_qm_err_mask *dev_err = &err_info->dev_err; + + qm_err->fe = HZIP_CORE_INT_RAS_FE_ENB_MASK; + qm_err->ce = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_QM_CE_MASK_CAP, qm->cap_ver); + qm_err->nfe = hisi_qm_get_hw_info(qm, zip_basic_cap_info, + ZIP_QM_NFE_MASK_CAP, qm->cap_ver); + qm_err->ecc_2bits_mask = QM_ECC_MBIT; + qm_err->reset_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, + ZIP_QM_RESET_MASK_CAP, qm->cap_ver); + qm_err->shutdown_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, + ZIP_QM_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + + dev_err->fe = HZIP_CORE_INT_RAS_FE_ENB_MASK; + dev_err->ce = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_CE_MASK_CAP, qm->cap_ver); + dev_err->nfe = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_NFE_MASK_CAP, qm->cap_ver); + dev_err->ecc_2bits_mask = HZIP_CORE_INT_STATUS_M_ECC; + dev_err->shutdown_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, + ZIP_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); + dev_err->reset_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, + ZIP_RESET_MASK_CAP, qm->cap_ver); - err_info->fe = HZIP_CORE_INT_RAS_FE_ENB_MASK; - err_info->ce = hisi_qm_get_hw_info(qm, zip_basic_cap_info, ZIP_QM_CE_MASK_CAP, qm->cap_ver); - err_info->nfe = hisi_qm_get_hw_info(qm, zip_basic_cap_info, - ZIP_QM_NFE_MASK_CAP, qm->cap_ver); - err_info->ecc_2bits_mask = HZIP_CORE_INT_STATUS_M_ECC; - err_info->qm_shutdown_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, - ZIP_QM_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); - err_info->dev_shutdown_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, - ZIP_OOO_SHUTDOWN_MASK_CAP, qm->cap_ver); - err_info->qm_reset_mask = hisi_qm_get_hw_info(qm, zip_basic_cap_info, - ZIP_QM_RESET_MASK_CAP, qm->cap_ver); - err_info->dev_reset_mask = 
hisi_qm_get_hw_info(qm, zip_basic_cap_info, - ZIP_RESET_MASK_CAP, qm->cap_ver); err_info->msi_wr_port = HZIP_WR_PORT; err_info->acpi_rst = "ZRST"; } @@ -1302,6 +1334,8 @@ static const struct hisi_qm_err_ini hisi_zip_err_ini = { .get_err_result = hisi_zip_get_err_result, .set_priv_status = hisi_zip_set_priv_status, .dev_is_abnormal = hisi_zip_dev_is_abnormal, + .disable_axi_error = hisi_zip_disable_axi_error, + .enable_axi_error = hisi_zip_enable_axi_error, }; static int hisi_zip_pf_probe_init(struct hisi_zip *hisi_zip) diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index f2254ddc327c..c4690e365ade 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -104,6 +104,8 @@ #define UACCE_MODE_SVA 1 /* use uacce sva mode */ #define UACCE_MODE_DESC "0(default) means only register to crypto, 1 means both register to crypto and uacce" +#define QM_ECC_MBIT BIT(2) + enum qm_stop_reason { QM_NORMAL, QM_SOFT_RESET, @@ -240,19 +242,22 @@ enum acc_err_result { ACC_ERR_RECOVERED, }; -struct hisi_qm_err_info { - char *acpi_rst; - u32 msi_wr_port; +struct hisi_qm_err_mask { u32 ecc_2bits_mask; - u32 qm_shutdown_mask; - u32 dev_shutdown_mask; - u32 qm_reset_mask; - u32 dev_reset_mask; + u32 shutdown_mask; + u32 reset_mask; u32 ce; u32 nfe; u32 fe; }; +struct hisi_qm_err_info { + char *acpi_rst; + u32 msi_wr_port; + struct hisi_qm_err_mask qm_err; + struct hisi_qm_err_mask dev_err; +}; + struct hisi_qm_err_status { u32 is_qm_ecc_mbit; u32 is_dev_ecc_mbit; @@ -273,6 +278,8 @@ struct hisi_qm_err_ini { enum acc_err_result (*get_err_result)(struct hisi_qm *qm); bool (*dev_is_abnormal)(struct hisi_qm *qm); int (*set_priv_status)(struct hisi_qm *qm); + void (*disable_axi_error)(struct hisi_qm *qm); + void (*enable_axi_error)(struct hisi_qm *qm); }; struct hisi_qm_cap_info { -- cgit v1.2.3 From 79525b51acc1c8e331ab47eb131a99f5370a76c2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 19 Sep 2025 12:38:58 -0700 Subject: io_uring: fix nvme's 32b cqes on mixed cq The nvme uring_cmd only uses 32b CQEs. If the ring uses a mixed CQ, then we need to make sure we flag the completion as a 32b CQE. On the other hand, if nvme uring_cmd was using a dedicated 32b CQE, the posting was missing the extra memcpy because it only applied to big CQEs on a mixed CQ.
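As a hedged illustration (my_task_cb() and the zero result values are hypothetical, not taken from the nvme driver), a uring_cmd provider that posts 32-byte CQEs now completes through the 32b helper added below:

	static void my_task_cb(struct io_uring_cmd *ioucmd,
			       unsigned int issue_flags)
	{
		u64 res2 = 0;	/* the extra 64 bits carried by a big CQE */

		/* on an IORING_SETUP_CQE_MIXED ring this also sets
		 * IORING_CQE_F_32 so userspace can size the entry correctly
		 */
		io_uring_cmd_done32(ioucmd, 0, res2, issue_flags);
	}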
Fixes: e26dca67fde1943 ("io_uring: add support for IORING_SETUP_CQE_MIXED") Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 2 +- include/linux/io_uring/cmd.h | 20 ++++++++++++++++---- io_uring/io_uring.h | 2 +- io_uring/uring_cmd.c | 11 +++++++---- 4 files changed, 25 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 6b3ac8ae3f34..e28bb9113f64 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -410,7 +410,7 @@ static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, if (pdu->bio) blk_rq_unmap_user(pdu->bio); - io_uring_cmd_done(ioucmd, pdu->status, pdu->result, issue_flags); + io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, issue_flags); } static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index c8185f54fde9..02d50f08f668 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -56,8 +56,8 @@ int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, * Note: the caller should never hard code @issue_flags and is only allowed * to pass the mask provided by the core io_uring code. */ -void io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 res2, - unsigned issue_flags); +void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 res2, + unsigned issue_flags, bool is_cqe32); void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, io_uring_cmd_tw_t task_work_cb, @@ -104,8 +104,8 @@ static inline int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, { return -EOPNOTSUPP; } -static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, - u64 ret2, unsigned issue_flags) +static inline void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, + u64 ret2, unsigned issue_flags, bool is_cqe32) { } static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, @@ -159,6 +159,18 @@ static inline void *io_uring_cmd_ctx_handle(struct io_uring_cmd *cmd) return cmd_to_io_kiocb(cmd)->ctx; } +static inline void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, + u64 res2, unsigned issue_flags) +{ + return __io_uring_cmd_done(ioucmd, ret, res2, issue_flags, false); +} + +static inline void io_uring_cmd_done32(struct io_uring_cmd *ioucmd, s32 ret, + u64 res2, unsigned issue_flags) +{ + return __io_uring_cmd_done(ioucmd, ret, res2, issue_flags, true); +} + int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, void (*release)(void *), unsigned int index, unsigned int issue_flags); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e92099d8fcb7..af4c11310652 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -275,7 +275,7 @@ static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, return false; memcpy(cqe, &req->cqe, sizeof(*cqe)); - if (is_cqe32) { + if (ctx->flags & IORING_SETUP_CQE32 || is_cqe32) { memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 5562e8491c5b..a688c9f1a21c 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -151,8 +151,8 @@ static inline void io_req_set_cqe32_extra(struct io_kiocb *req, * Called by consumers of io_uring_cmd, if they originally returned * -EIOCBQUEUED upon receiving the command. 
*/ -void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, - unsigned issue_flags) +void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, + unsigned issue_flags, bool is_cqe32) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); @@ -165,8 +165,11 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, req_set_fail(req); io_req_set_res(req, ret, 0); - if (req->ctx->flags & IORING_SETUP_CQE32) + if (is_cqe32) { + if (req->ctx->flags & IORING_SETUP_CQE_MIXED) + req->cqe.flags |= IORING_CQE_F_32; io_req_set_cqe32_extra(req, res2, 0); + } io_req_uring_cleanup(req, issue_flags); if (req->ctx->flags & IORING_SETUP_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ @@ -180,7 +183,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, io_req_task_work_add(req); } } -EXPORT_SYMBOL_GPL(io_uring_cmd_done); +EXPORT_SYMBOL_GPL(__io_uring_cmd_done); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -- cgit v1.2.3 From 1e338f4d99e6814ede16bad1db1cc463aad8032c Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Sun, 10 Aug 2025 17:57:45 +0500 Subject: kasan: introduce ARCH_DEFER_KASAN and unify static key across modes Patch series "kasan: unify kasan_enabled() and remove arch-specific implementations", v6. This patch series addresses the fragmentation in KASAN initialization across architectures by introducing a unified approach that eliminates duplicate static keys and arch-specific kasan_arch_is_ready() implementations. The core issue is that different architectures have inconsistent approaches to KASAN readiness tracking: - PowerPC, LoongArch, and UML arch, each implement their own kasan_arch_is_ready() - Only HW_TAGS mode had a unified static key (kasan_flag_enabled) - Generic and SW_TAGS modes relied on arch-specific solutions or always-on behavior This patch (of 2): Introduce CONFIG_ARCH_DEFER_KASAN to identify architectures [1] that need to defer KASAN initialization until shadow memory is properly set up, and unify the static key infrastructure across all KASAN modes. [1] PowerPC, UML and LoongArch select ARCH_DEFER_KASAN. The core issue is that different architectures have inconsistent approaches to KASAN readiness tracking: - PowerPC, LoongArch, and UML arch, each implement their own kasan_arch_is_ready() - Only HW_TAGS mode had a unified static key (kasan_flag_enabled) - Generic and SW_TAGS modes relied on arch-specific solutions or always-on behavior This patch addresses the fragmentation in KASAN initialization across architectures by introducing a unified approach that eliminates duplicate static keys and arch-specific kasan_arch_is_ready() implementations. Let's replace kasan_arch_is_ready() with the existing kasan_enabled() check, which examines the static key being enabled if the arch selects ARCH_DEFER_KASAN or has HW_TAGS mode support. For other arches, kasan_enabled() checks enablement at compile time. Now KASAN users can use a single kasan_enabled() check everywhere.
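A minimal sketch of the resulting pattern (the arch hook and map_kasan_shadow() are assumptions for illustration; kasan_init_generic() and kasan_enabled() are the interfaces this patch unifies):

	/* arch side, for an arch selecting ARCH_NEEDS_DEFER_KASAN */
	void __init my_arch_kasan_init(void)	/* hypothetical hook */
	{
		map_kasan_shadow();		/* arch-specific, assumed */
		kasan_init_generic();		/* enables the unified static key */
	}

	/* common code: one gate for every mode and architecture */
	static bool my_shadow_ready(void)
	{
		return kasan_enabled();		/* replaces kasan_arch_is_ready() */
	}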
Link: https://lkml.kernel.org/r/20250810125746.1105476-1-snovitoll@gmail.com Link: https://lkml.kernel.org/r/20250810125746.1105476-2-snovitoll@gmail.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217049 Signed-off-by: Sabyrzhan Tasbolatov Reviewed-by: Christophe Leroy Reviewed-by: Ritesh Harjani (IBM) #powerpc Cc: Alexander Gordeev Cc: Alexander Potapenko Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Baoquan He Cc: David Gow Cc: Dmitriy Vyukov Cc: Heiko Carstens Cc: Huacai Chen Cc: Marco Elver Cc: Qing Zhang Cc: Sabyrzhan Tasbolatov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/kasan.h | 7 ------- arch/loongarch/mm/kasan_init.c | 8 +++----- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/kasan.h | 12 ------------ arch/powerpc/mm/kasan/init_32.c | 2 +- arch/powerpc/mm/kasan/init_book3e_64.c | 2 +- arch/powerpc/mm/kasan/init_book3s_64.c | 6 +----- arch/um/Kconfig | 1 + arch/um/include/asm/kasan.h | 5 ++--- arch/um/kernel/mem.c | 13 ++++++++++--- include/linux/kasan-enabled.h | 32 +++++++++++++++++++++++--------- include/linux/kasan.h | 6 ++++++ lib/Kconfig.kasan | 12 ++++++++++++ mm/kasan/common.c | 17 +++++++++++++---- mm/kasan/generic.c | 19 +++++++++++++++---- mm/kasan/hw_tags.c | 9 +-------- mm/kasan/kasan.h | 8 +++++++- mm/kasan/shadow.c | 12 ++++++------ mm/kasan/sw_tags.c | 1 + mm/kasan/tags.c | 2 +- 21 files changed, 106 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index f0abc38c40ac..e449e3fcecf9 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -9,6 +9,7 @@ config LOONGARCH select ACPI_PPTT if ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_BINFMT_ELF_STATE + select ARCH_NEEDS_DEFER_KASAN select ARCH_DISABLE_KASAN_INLINE select ARCH_ENABLE_MEMORY_HOTPLUG select ARCH_ENABLE_MEMORY_HOTREMOVE diff --git a/arch/loongarch/include/asm/kasan.h b/arch/loongarch/include/asm/kasan.h index 62f139a9c87d..0e50e5b5e056 100644 --- a/arch/loongarch/include/asm/kasan.h +++ b/arch/loongarch/include/asm/kasan.h @@ -66,7 +66,6 @@ #define XKPRANGE_WC_SHADOW_OFFSET (KASAN_SHADOW_START + XKPRANGE_WC_KASAN_OFFSET) #define XKVRANGE_VC_SHADOW_OFFSET (KASAN_SHADOW_START + XKVRANGE_VC_KASAN_OFFSET) -extern bool kasan_early_stage; extern unsigned char kasan_early_shadow_page[PAGE_SIZE]; #define kasan_mem_to_shadow kasan_mem_to_shadow @@ -75,12 +74,6 @@ void *kasan_mem_to_shadow(const void *addr); #define kasan_shadow_to_mem kasan_shadow_to_mem const void *kasan_shadow_to_mem(const void *shadow_addr); -#define kasan_arch_is_ready kasan_arch_is_ready -static __always_inline bool kasan_arch_is_ready(void) -{ - return !kasan_early_stage; -} - #define addr_has_metadata addr_has_metadata static __always_inline bool addr_has_metadata(const void *addr) { diff --git a/arch/loongarch/mm/kasan_init.c b/arch/loongarch/mm/kasan_init.c index d2681272d8f0..170da98ad4f5 100644 --- a/arch/loongarch/mm/kasan_init.c +++ b/arch/loongarch/mm/kasan_init.c @@ -40,11 +40,9 @@ static pgd_t kasan_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); #define __pte_none(early, pte) (early ? 
pte_none(pte) : \ ((pte_val(pte) & _PFN_MASK) == (unsigned long)__pa(kasan_early_shadow_page))) -bool kasan_early_stage = true; - void *kasan_mem_to_shadow(const void *addr) { - if (!kasan_arch_is_ready()) { + if (!kasan_enabled()) { return (void *)(kasan_early_shadow_page); } else { unsigned long maddr = (unsigned long)addr; @@ -298,7 +296,8 @@ void __init kasan_init(void) kasan_populate_early_shadow(kasan_mem_to_shadow((void *)VMALLOC_START), kasan_mem_to_shadow((void *)KFENCE_AREA_END)); - kasan_early_stage = false; + /* Enable KASAN here before kasan_mem_to_shadow(). */ + kasan_init_generic(); /* Populate the linear mapping */ for_each_mem_range(i, &pa_start, &pa_end) { @@ -329,5 +328,4 @@ void __init kasan_init(void) /* At this point kasan is fully initialized. Enable error messages */ init_task.kasan_depth = 0; - pr_info("KernelAddressSanitizer initialized.\n"); } diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 93402a1d9c9f..4730c676b6bf 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -122,6 +122,7 @@ config PPC # Please keep this list sorted alphabetically. # select ARCH_32BIT_OFF_T if PPC32 + select ARCH_NEEDS_DEFER_KASAN if PPC_RADIX_MMU select ARCH_DISABLE_KASAN_INLINE if PPC_RADIX_MMU select ARCH_DMA_DEFAULT_COHERENT if !NOT_COHERENT_CACHE select ARCH_ENABLE_MEMORY_HOTPLUG diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h index b5bbb94c51f6..957a57c1db58 100644 --- a/arch/powerpc/include/asm/kasan.h +++ b/arch/powerpc/include/asm/kasan.h @@ -53,18 +53,6 @@ #endif #ifdef CONFIG_KASAN -#ifdef CONFIG_PPC_BOOK3S_64 -DECLARE_STATIC_KEY_FALSE(powerpc_kasan_enabled_key); - -static __always_inline bool kasan_arch_is_ready(void) -{ - if (static_branch_likely(&powerpc_kasan_enabled_key)) - return true; - return false; -} - -#define kasan_arch_is_ready kasan_arch_is_ready -#endif void kasan_early_init(void); void kasan_mmu_init(void); diff --git a/arch/powerpc/mm/kasan/init_32.c b/arch/powerpc/mm/kasan/init_32.c index 03666d790a53..1d083597464f 100644 --- a/arch/powerpc/mm/kasan/init_32.c +++ b/arch/powerpc/mm/kasan/init_32.c @@ -165,7 +165,7 @@ void __init kasan_init(void) /* At this point kasan is fully initialized. 
Enable error messages */ init_task.kasan_depth = 0; - pr_info("KASAN init done\n"); + kasan_init_generic(); } void __init kasan_late_init(void) diff --git a/arch/powerpc/mm/kasan/init_book3e_64.c b/arch/powerpc/mm/kasan/init_book3e_64.c index 60c78aac0f63..0d3a73d6d4b0 100644 --- a/arch/powerpc/mm/kasan/init_book3e_64.c +++ b/arch/powerpc/mm/kasan/init_book3e_64.c @@ -127,7 +127,7 @@ void __init kasan_init(void) /* Enable error messages */ init_task.kasan_depth = 0; - pr_info("KASAN init done\n"); + kasan_init_generic(); } void __init kasan_late_init(void) { } diff --git a/arch/powerpc/mm/kasan/init_book3s_64.c b/arch/powerpc/mm/kasan/init_book3s_64.c index 7d959544c077..dcafa641804c 100644 --- a/arch/powerpc/mm/kasan/init_book3s_64.c +++ b/arch/powerpc/mm/kasan/init_book3s_64.c @@ -19,8 +19,6 @@ #include #include -DEFINE_STATIC_KEY_FALSE(powerpc_kasan_enabled_key); - static void __init kasan_init_phys_region(void *start, void *end) { unsigned long k_start, k_end, k_cur; @@ -92,11 +90,9 @@ void __init kasan_init(void) */ memset(kasan_early_shadow_page, 0, PAGE_SIZE); - static_branch_inc(&powerpc_kasan_enabled_key); - /* Enable error messages */ init_task.kasan_depth = 0; - pr_info("KASAN init done\n"); + kasan_init_generic(); } void __init kasan_early_init(void) { } diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 9083bfdb7735..1d4def0db841 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -5,6 +5,7 @@ menu "UML-specific options" config UML bool default y + select ARCH_NEEDS_DEFER_KASAN if STATIC_LINK select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_CPU_FINALIZE_INIT diff --git a/arch/um/include/asm/kasan.h b/arch/um/include/asm/kasan.h index f97bb1f7b851..b54a4e937fd1 100644 --- a/arch/um/include/asm/kasan.h +++ b/arch/um/include/asm/kasan.h @@ -24,10 +24,9 @@ #ifdef CONFIG_KASAN void kasan_init(void); -extern int kasan_um_is_ready; -#ifdef CONFIG_STATIC_LINK -#define kasan_arch_is_ready() (kasan_um_is_ready) +#if defined(CONFIG_STATIC_LINK) && defined(CONFIG_KASAN_INLINE) +#error UML does not work in KASAN_INLINE mode with STATIC_LINK enabled! #endif #else static inline void kasan_init(void) { } diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 76bec7de81b5..32e3b1972dc1 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -21,10 +21,10 @@ #include #include #include +#include #ifdef CONFIG_KASAN -int kasan_um_is_ready; -void kasan_init(void) +void __init kasan_init(void) { /* * kasan_map_memory will map all of the required address space and @@ -32,7 +32,11 @@ void kasan_init(void) */ kasan_map_memory((void *)KASAN_SHADOW_START, KASAN_SHADOW_SIZE); init_task.kasan_depth = 0; - kasan_um_is_ready = true; + /* + * Since kasan_init() is called before main(), + * KASAN is initialized but the enablement is deferred after + * jump_label_init(). See arch_mm_preinit(). + */ } static void (*kasan_init_ptr)(void) @@ -58,6 +62,9 @@ static unsigned long brk_end; void __init arch_mm_preinit(void) { + /* Safe to call after jump_label_init(). Enables KASAN. */ + kasan_init_generic(); + /* clear the zero-page */ memset(empty_zero_page, 0, PAGE_SIZE); diff --git a/include/linux/kasan-enabled.h b/include/linux/kasan-enabled.h index 6f612d69ea0c..9eca967d8526 100644 --- a/include/linux/kasan-enabled.h +++ b/include/linux/kasan-enabled.h @@ -4,32 +4,46 @@ #include -#ifdef CONFIG_KASAN_HW_TAGS - +#if defined(CONFIG_ARCH_DEFER_KASAN) || defined(CONFIG_KASAN_HW_TAGS) +/* + * Global runtime flag for KASAN modes that need runtime control. 
+ * Used by ARCH_DEFER_KASAN architectures and HW_TAGS mode. + */ DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled); +/* + * Runtime control for shadow memory initialization or HW_TAGS mode. + * Uses static key for architectures that need deferred KASAN or HW_TAGS. + */ static __always_inline bool kasan_enabled(void) { return static_branch_likely(&kasan_flag_enabled); } -static inline bool kasan_hw_tags_enabled(void) +static inline void kasan_enable(void) { - return kasan_enabled(); + static_branch_enable(&kasan_flag_enabled); } - -#else /* CONFIG_KASAN_HW_TAGS */ - -static inline bool kasan_enabled(void) +#else +/* For architectures that can enable KASAN early, use compile-time check. */ +static __always_inline bool kasan_enabled(void) { return IS_ENABLED(CONFIG_KASAN); } +static inline void kasan_enable(void) {} +#endif /* CONFIG_ARCH_DEFER_KASAN || CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_HW_TAGS +static inline bool kasan_hw_tags_enabled(void) +{ + return kasan_enabled(); +} +#else static inline bool kasan_hw_tags_enabled(void) { return false; } - #endif /* CONFIG_KASAN_HW_TAGS */ #endif /* LINUX_KASAN_ENABLED_H */ diff --git a/include/linux/kasan.h b/include/linux/kasan.h index fe5ce9215821..b509a8d36949 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -543,6 +543,12 @@ void kasan_report_async(void); #endif /* CONFIG_KASAN_HW_TAGS */ +#ifdef CONFIG_KASAN_GENERIC +void __init kasan_init_generic(void); +#else +static inline void kasan_init_generic(void) { } +#endif + #ifdef CONFIG_KASAN_SW_TAGS void __init kasan_init_sw_tags(void); #else diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index f82889a830fa..a4bb610a7a6f 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -19,6 +19,18 @@ config ARCH_DISABLE_KASAN_INLINE Disables both inline and stack instrumentation. Selected by architectures that do not support these instrumentation types. +config ARCH_NEEDS_DEFER_KASAN + bool + +config ARCH_DEFER_KASAN + def_bool y + depends on KASAN && ARCH_NEEDS_DEFER_KASAN + help + Architectures should select this if they need to defer KASAN + initialization until shadow memory is properly set up. This + enables runtime control via static keys. Otherwise, KASAN uses + compile-time constants for better performance. + config CC_HAS_KASAN_GENERIC def_bool $(cc-option, -fsanitize=kernel-address) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 9142964ab9c9..e3765931a31f 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -32,6 +32,15 @@ #include "kasan.h" #include "../slab.h" +#if defined(CONFIG_ARCH_DEFER_KASAN) || defined(CONFIG_KASAN_HW_TAGS) +/* + * Definition of the unified static key declared in kasan-enabled.h. + * This provides consistent runtime enable/disable across KASAN modes. 
+ */ +DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); +EXPORT_SYMBOL_GPL(kasan_flag_enabled); +#endif + struct slab *kasan_addr_to_slab(const void *addr) { if (virt_addr_valid(addr)) @@ -246,7 +255,7 @@ static inline void poison_slab_object(struct kmem_cache *cache, void *object, bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object, unsigned long ip) { - if (!kasan_arch_is_ready() || is_kfence_address(object)) + if (is_kfence_address(object)) return false; return check_slab_allocation(cache, object, ip); } @@ -254,7 +263,7 @@ bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object, bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, bool still_accessible) { - if (!kasan_arch_is_ready() || is_kfence_address(object)) + if (is_kfence_address(object)) return false; /* @@ -293,7 +302,7 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, static inline bool check_page_allocation(void *ptr, unsigned long ip) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return false; if (ptr != page_address(virt_to_head_page(ptr))) { @@ -522,7 +531,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) return true; } - if (is_kfence_address(ptr) || !kasan_arch_is_ready()) + if (is_kfence_address(ptr)) return true; slab = folio_slab(folio); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index d54e89f8c3e7..b413c46b3e04 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -36,6 +36,17 @@ #include "kasan.h" #include "../slab.h" +/* + * Initialize Generic KASAN and enable runtime checks. + * This should be called from arch kasan_init() once shadow memory is ready. + */ +void __init kasan_init_generic(void) +{ + kasan_enable(); + + pr_info("KernelAddressSanitizer initialized (generic)\n"); +} + /* * All functions below always inlined so compiler could * perform better optimizations in each of __asan_loadX/__assn_storeX @@ -165,7 +176,7 @@ static __always_inline bool check_region_inline(const void *addr, size_t size, bool write, unsigned long ret_ip) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return true; if (unlikely(size == 0)) @@ -193,7 +204,7 @@ bool kasan_byte_accessible(const void *addr) { s8 shadow_byte; - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return true; shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr)); @@ -495,7 +506,7 @@ static void release_alloc_meta(struct kasan_alloc_meta *meta) static void release_free_meta(const void *object, struct kasan_free_meta *meta) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; /* Check if free meta is valid. */ @@ -562,7 +573,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) kasan_save_track(&alloc_meta->alloc_track, flags); } -void kasan_save_free_info(struct kmem_cache *cache, void *object) +void __kasan_save_free_info(struct kmem_cache *cache, void *object) { struct kasan_free_meta *free_meta; diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 9a6927394b54..c8289a3feabf 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -45,13 +45,6 @@ static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata; -/* - * Whether KASAN is enabled at all. - * The value remains false until KASAN is initialized by kasan_init_hw_tags(). 
- */ -DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); -EXPORT_SYMBOL(kasan_flag_enabled); - /* * Whether the selected mode is synchronous, asynchronous, or asymmetric. * Defaults to KASAN_MODE_SYNC. @@ -260,7 +253,7 @@ void __init kasan_init_hw_tags(void) kasan_init_tags(); /* KASAN is now initialized, enable it. */ - static_branch_enable(&kasan_flag_enabled); + kasan_enable(); pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n", kasan_mode_info(), diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 129178be5e64..8a9d8a6ea717 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -398,7 +398,13 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags); void kasan_set_track(struct kasan_track *track, depot_stack_handle_t stack); void kasan_save_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); -void kasan_save_free_info(struct kmem_cache *cache, void *object); + +void __kasan_save_free_info(struct kmem_cache *cache, void *object); +static inline void kasan_save_free_info(struct kmem_cache *cache, void *object) +{ + if (kasan_enabled()) + __kasan_save_free_info(cache, object); +} #ifdef CONFIG_KASAN_GENERIC bool kasan_quarantine_put(struct kmem_cache *cache, void *object); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 11d472a5c4e8..5d2a876035d6 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -125,7 +125,7 @@ void kasan_poison(const void *addr, size_t size, u8 value, bool init) { void *shadow_start, *shadow_end; - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; /* @@ -150,7 +150,7 @@ EXPORT_SYMBOL_GPL(kasan_poison); #ifdef CONFIG_KASAN_GENERIC void kasan_poison_last_granule(const void *addr, size_t size) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; if (size & KASAN_GRANULE_MASK) { @@ -408,7 +408,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mas unsigned long shadow_start, shadow_end; int ret; - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return 0; if (!is_vmalloc_or_module_addr((void *)addr)) @@ -583,7 +583,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long region_start, region_end; unsigned long size; - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE); @@ -634,7 +634,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. 
*/ - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return (void *)start; if (!is_vmalloc_or_module_addr(start)) @@ -659,7 +659,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, */ void __kasan_poison_vmalloc(const void *start, unsigned long size) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; if (!is_vmalloc_or_module_addr(start)) diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index b9382b5b6a37..c75741a74602 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -44,6 +44,7 @@ void __init kasan_init_sw_tags(void) per_cpu(prng_state, cpu) = (u32)get_cycles(); kasan_init_tags(); + kasan_enable(); pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n", str_on_off(kasan_stack_collection_enabled())); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index d65d48b85f90..b9f31293622b 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -142,7 +142,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) save_stack_info(cache, object, flags, false); } -void kasan_save_free_info(struct kmem_cache *cache, void *object) +void __kasan_save_free_info(struct kmem_cache *cache, void *object) { save_stack_info(cache, object, 0, true); } -- cgit v1.2.3 From 2ccd9fecd9163f168761d4398564c81554f636ef Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 29 Aug 2025 17:15:27 +0100 Subject: mm: remove unused zpool layer With zswap using zsmalloc directly, there are no more in-tree users of this code. Remove it. With zpool gone, zsmalloc is now always a simple dependency and no longer something the user needs to configure. Hide CONFIG_ZSMALLOC from the user and have zswap and zram pull it in as needed. Link: https://lkml.kernel.org/r/20250829162212.208258-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: SeongJae Park Acked-by: Yosry Ahmed Cc: Chengming Zhou Cc: Nhat Pham Cc: Vitaly Wool Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/zswap.rst | 33 +-- Documentation/core-api/mm-api.rst | 1 - Documentation/driver-api/crypto/iaa/iaa-crypto.rst | 2 - MAINTAINERS | 2 - arch/loongarch/configs/loongson3_defconfig | 1 - include/linux/zpool.h | 86 ------ mm/Kconfig | 49 +-- mm/Makefile | 1 - mm/zpool.c | 328 --------------------- mm/zsmalloc.c | 79 ----- tools/testing/selftests/zram/README | 1 - 11 files changed, 23 insertions(+), 560 deletions(-) delete mode 100644 include/linux/zpool.h delete mode 100644 mm/zpool.c (limited to 'include/linux') diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index fd3370aa43fe..283d77217c6f 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -53,26 +53,17 @@ Zswap receives pages for compression from the swap subsystem and is able to evict pages from its own compressed pool on an LRU basis and write them back to the backing swap device in the case that the compressed pool is full. -Zswap makes use of zpool for the managing the compressed memory pool. Each -allocation in zpool is not directly accessible by address. Rather, a handle is +Zswap makes use of zsmalloc for the managing the compressed memory pool. Each +allocation in zsmalloc is not directly accessible by address. Rather, a handle is returned by the allocation routine and that handle must be mapped before being accessed. The compressed memory pool grows on demand and shrinks as compressed -pages are freed. The pool is not preallocated. 
By default, a zpool -of type selected in ``CONFIG_ZSWAP_ZPOOL_DEFAULT`` Kconfig option is created, -but it can be overridden at boot time by setting the ``zpool`` attribute, -e.g. ``zswap.zpool=zsmalloc``. It can also be changed at runtime using the sysfs -``zpool`` attribute, e.g.:: - - echo zsmalloc > /sys/module/zswap/parameters/zpool - -The zsmalloc type zpool has a complex compressed page storage method, and it -can achieve great storage densities. +pages are freed. The pool is not preallocated. When a swap page is passed from swapout to zswap, zswap maintains a mapping -of the swap entry, a combination of the swap type and swap offset, to the zpool -handle that references that compressed swap page. This mapping is achieved -with a red-black tree per swap type. The swap offset is the search key for the -tree nodes. +of the swap entry, a combination of the swap type and swap offset, to the +zsmalloc handle that references that compressed swap page. This mapping is +achieved with a red-black tree per swap type. The swap offset is the search +key for the tree nodes. During a page fault on a PTE that is a swap entry, the swapin code calls the zswap load function to decompress the page into the page allocated by the page @@ -96,11 +87,11 @@ attribute, e.g.:: echo lzo > /sys/module/zswap/parameters/compressor -When the zpool and/or compressor parameter is changed at runtime, any existing -compressed pages are not modified; they are left in their own zpool. When a -request is made for a page in an old zpool, it is uncompressed using its -original compressor. Once all pages are removed from an old zpool, the zpool -and its compressor are freed. +When the compressor parameter is changed at runtime, any existing compressed +pages are not modified; they are left in their own pool. When a request is +made for a page in an old pool, it is uncompressed using its original +compressor. Once all pages are removed from an old pool, the pool and its +compressor are freed. Some of the pages in zswap are same-value filled pages (i.e. contents of the page have same value or repetitive pattern). These pages include zero-filled diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 5063179cfc70..68193a4cfcf5 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -118,7 +118,6 @@ More Memory Management Functions .. kernel-doc:: mm/memremap.c .. kernel-doc:: mm/hugetlb.c .. kernel-doc:: mm/swap.c -.. kernel-doc:: mm/zpool.c .. kernel-doc:: mm/memcontrol.c .. #kernel-doc:: mm/memory-tiers.c (build warnings) .. 
kernel-doc:: mm/shmem.c diff --git a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst index 8e50b900d51c..f815d4fd8372 100644 --- a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst +++ b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst @@ -476,7 +476,6 @@ Use the following commands to enable zswap:: # echo 0 > /sys/module/zswap/parameters/enabled # echo 50 > /sys/module/zswap/parameters/max_pool_percent # echo deflate-iaa > /sys/module/zswap/parameters/compressor - # echo zsmalloc > /sys/module/zswap/parameters/zpool # echo 1 > /sys/module/zswap/parameters/enabled # echo 100 > /proc/sys/vm/swappiness # echo never > /sys/kernel/mm/transparent_hugepage/enabled @@ -625,7 +624,6 @@ the 'fixed' compression mode:: echo 0 > /sys/module/zswap/parameters/enabled echo 50 > /sys/module/zswap/parameters/max_pool_percent echo deflate-iaa > /sys/module/zswap/parameters/compressor - echo zsmalloc > /sys/module/zswap/parameters/zpool echo 1 > /sys/module/zswap/parameters/enabled echo 100 > /proc/sys/vm/swappiness diff --git a/MAINTAINERS b/MAINTAINERS index 9344c33c52e1..a7e123ddf05a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -27879,9 +27879,7 @@ R: Chengming Zhou L: linux-mm@kvack.org S: Maintained F: Documentation/admin-guide/mm/zswap.rst -F: include/linux/zpool.h F: include/linux/zswap.h -F: mm/zpool.c F: mm/zswap.c F: tools/testing/selftests/cgroup/test_zswap.c diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index 34eaee0384c9..2b8df0e9e42a 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -106,7 +106,6 @@ CONFIG_CMDLINE_PARTITION=y CONFIG_IOSCHED_BFQ=y CONFIG_BFQ_GROUP_IOSCHED=y CONFIG_BINFMT_MISC=m -CONFIG_ZPOOL=y CONFIG_ZSWAP=y CONFIG_ZSWAP_COMPRESSOR_DEFAULT_ZSTD=y CONFIG_ZSMALLOC=y diff --git a/include/linux/zpool.h b/include/linux/zpool.h deleted file mode 100644 index 369ef068fad8..000000000000 --- a/include/linux/zpool.h +++ /dev/null @@ -1,86 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * zpool memory storage api - * - * Copyright (C) 2014 Dan Streetman - * - * This is a common frontend for the zswap compressed memory storage - * implementations. - */ - -#ifndef _ZPOOL_H_ -#define _ZPOOL_H_ - -struct zpool; - -bool zpool_has_pool(char *type); - -struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp); - -const char *zpool_get_type(struct zpool *pool); - -void zpool_destroy_pool(struct zpool *pool); - -int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, - unsigned long *handle, const int nid); - -void zpool_free(struct zpool *pool, unsigned long handle); - -void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle, - void *local_copy); - -void zpool_obj_read_end(struct zpool *zpool, unsigned long handle, - void *handle_mem); - -void zpool_obj_write(struct zpool *zpool, unsigned long handle, - void *handle_mem, size_t mem_len); - -u64 zpool_get_total_pages(struct zpool *pool); - - -/** - * struct zpool_driver - driver implementation for zpool - * @type: name of the driver. - * @list: entry in the list of zpool drivers. - * @create: create a new pool. - * @destroy: destroy a pool. - * @malloc: allocate mem from a pool. - * @free: free mem from a pool. - * @sleep_mapped: whether zpool driver can sleep during map. - * @map: map a handle. - * @unmap: unmap a handle. - * @total_size: get total size of a pool. 
- * - * This is created by a zpool implementation and registered - * with zpool. - */ -struct zpool_driver { - char *type; - struct module *owner; - atomic_t refcount; - struct list_head list; - - void *(*create)(const char *name, gfp_t gfp); - void (*destroy)(void *pool); - - int (*malloc)(void *pool, size_t size, gfp_t gfp, - unsigned long *handle, const int nid); - void (*free)(void *pool, unsigned long handle); - - void *(*obj_read_begin)(void *pool, unsigned long handle, - void *local_copy); - void (*obj_read_end)(void *pool, unsigned long handle, - void *handle_mem); - void (*obj_write)(void *pool, unsigned long handle, - void *handle_mem, size_t mem_len); - - u64 (*total_pages)(void *pool); -}; - -void zpool_register_driver(struct zpool_driver *driver); - -int zpool_unregister_driver(struct zpool_driver *driver); - -bool zpool_can_sleep_mapped(struct zpool *pool); - -#endif diff --git a/mm/Kconfig b/mm/Kconfig index 4108bcd96784..b971d35c43c3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -9,9 +9,6 @@ menu "Memory Management options" config ARCH_NO_SWAP bool -config ZPOOL - bool - menuconfig SWAP bool "Support for paging of anonymous memory (swap)" depends on MMU && BLOCK && !ARCH_NO_SWAP @@ -26,7 +23,7 @@ config ZSWAP bool "Compressed cache for swap pages" depends on SWAP select CRYPTO - select ZPOOL + select ZSMALLOC help A lightweight compressed cache for swap pages. It takes pages that are in the process of being swapped out and attempts to @@ -125,45 +122,18 @@ config ZSWAP_COMPRESSOR_DEFAULT default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD default "" -choice - prompt "Default allocator" - depends on ZSWAP - default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if MMU - help - Selects the default allocator for the compressed cache for - swap pages. - The default is 'zbud' for compatibility, however please do - read the description of each of the allocators below before - making a right choice. - - The selection made here can be overridden by using the kernel - command line 'zswap.zpool=' option. +config ZSMALLOC + tristate -config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC - bool "zsmalloc" - select ZSMALLOC - help - Use the zsmalloc allocator as the default allocator. -endchoice +if ZSMALLOC -config ZSWAP_ZPOOL_DEFAULT - string - depends on ZSWAP - default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC - default "" +menu "Zsmalloc allocator options" + depends on ZSMALLOC -config ZSMALLOC - tristate - prompt "N:1 compression allocator (zsmalloc)" if (ZSWAP || ZRAM) - depends on MMU - help - zsmalloc is a slab-based memory allocator designed to store - pages of various compression levels efficiently. It achieves - the highest storage density with the least amount of fragmentation. +comment "Zsmalloc is a common backend allocator for zswap & zram" config ZSMALLOC_STAT bool "Export zsmalloc statistics" - depends on ZSMALLOC select DEBUG_FS help This option enables code in the zsmalloc to collect various @@ -175,7 +145,6 @@ config ZSMALLOC_CHAIN_SIZE int "Maximum number of physical pages per-zspage" default 8 range 4 16 - depends on ZSMALLOC help This option sets the upper limit on the number of physical pages that a zmalloc page (zspage) can consist of. The optimal zspage @@ -190,6 +159,10 @@ config ZSMALLOC_CHAIN_SIZE For more information, see zsmalloc documentation. 
+endmenu + +endif + menu "Slab allocator options" config SLUB diff --git a/mm/Makefile b/mm/Makefile index ef54aa615d9d..21abb3353550 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -115,7 +115,6 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o -obj-$(CONFIG_ZPOOL) += zpool.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o diff --git a/mm/zpool.c b/mm/zpool.c deleted file mode 100644 index 0a71d03369f1..000000000000 --- a/mm/zpool.c +++ /dev/null @@ -1,328 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * zpool memory storage api - * - * Copyright (C) 2014 Dan Streetman - * - * This is a common frontend for memory storage pool implementations. - * Typically, this is used to store compressed memory. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include - -struct zpool { - struct zpool_driver *driver; - void *pool; -}; - -static LIST_HEAD(drivers_head); -static DEFINE_SPINLOCK(drivers_lock); - -/** - * zpool_register_driver() - register a zpool implementation. - * @driver: driver to register - */ -void zpool_register_driver(struct zpool_driver *driver) -{ - spin_lock(&drivers_lock); - atomic_set(&driver->refcount, 0); - list_add(&driver->list, &drivers_head); - spin_unlock(&drivers_lock); -} -EXPORT_SYMBOL(zpool_register_driver); - -/** - * zpool_unregister_driver() - unregister a zpool implementation. - * @driver: driver to unregister. - * - * Module usage counting is used to prevent using a driver - * while/after unloading, so if this is called from module - * exit function, this should never fail; if called from - * other than the module exit function, and this returns - * failure, the driver is in use and must remain available. - */ -int zpool_unregister_driver(struct zpool_driver *driver) -{ - int ret = 0, refcount; - - spin_lock(&drivers_lock); - refcount = atomic_read(&driver->refcount); - WARN_ON(refcount < 0); - if (refcount > 0) - ret = -EBUSY; - else - list_del(&driver->list); - spin_unlock(&drivers_lock); - - return ret; -} -EXPORT_SYMBOL(zpool_unregister_driver); - -/* this assumes @type is null-terminated. */ -static struct zpool_driver *zpool_get_driver(const char *type) -{ - struct zpool_driver *driver; - - spin_lock(&drivers_lock); - list_for_each_entry(driver, &drivers_head, list) { - if (!strcmp(driver->type, type)) { - bool got = try_module_get(driver->owner); - - if (got) - atomic_inc(&driver->refcount); - spin_unlock(&drivers_lock); - return got ? driver : NULL; - } - } - - spin_unlock(&drivers_lock); - return NULL; -} - -static void zpool_put_driver(struct zpool_driver *driver) -{ - atomic_dec(&driver->refcount); - module_put(driver->owner); -} - -/** - * zpool_has_pool() - Check if the pool driver is available - * @type: The type of the zpool to check (e.g. zsmalloc) - * - * This checks if the @type pool driver is available. This will try to load - * the requested module, if needed, but there is no guarantee the module will - * still be loaded and available immediately after calling. If this returns - * true, the caller should assume the pool is available, but must be prepared - * to handle the @zpool_create_pool() returning failure. 
However if this - * returns false, the caller should assume the requested pool type is not - * available; either the requested pool type module does not exist, or could - * not be loaded, and calling @zpool_create_pool() with the pool type will - * fail. - * - * The @type string must be null-terminated. - * - * Returns: true if @type pool is available, false if not - */ -bool zpool_has_pool(char *type) -{ - struct zpool_driver *driver = zpool_get_driver(type); - - if (!driver) { - request_module("zpool-%s", type); - driver = zpool_get_driver(type); - } - - if (!driver) - return false; - - zpool_put_driver(driver); - return true; -} -EXPORT_SYMBOL(zpool_has_pool); - -/** - * zpool_create_pool() - Create a new zpool - * @type: The type of the zpool to create (e.g. zsmalloc) - * @name: The name of the zpool (e.g. zram0, zswap) - * @gfp: The GFP flags to use when allocating the pool. - * - * This creates a new zpool of the specified type. The gfp flags will be - * used when allocating memory, if the implementation supports it. If the - * ops param is NULL, then the created zpool will not be evictable. - * - * Implementations must guarantee this to be thread-safe. - * - * The @type and @name strings must be null-terminated. - * - * Returns: New zpool on success, NULL on failure. - */ -struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp) -{ - struct zpool_driver *driver; - struct zpool *zpool; - - pr_debug("creating pool type %s\n", type); - - driver = zpool_get_driver(type); - - if (!driver) { - request_module("zpool-%s", type); - driver = zpool_get_driver(type); - } - - if (!driver) { - pr_err("no driver for type %s\n", type); - return NULL; - } - - zpool = kmalloc(sizeof(*zpool), gfp); - if (!zpool) { - pr_err("couldn't create zpool - out of memory\n"); - zpool_put_driver(driver); - return NULL; - } - - zpool->driver = driver; - zpool->pool = driver->create(name, gfp); - - if (!zpool->pool) { - pr_err("couldn't create %s pool\n", type); - zpool_put_driver(driver); - kfree(zpool); - return NULL; - } - - pr_debug("created pool type %s\n", type); - - return zpool; -} - -/** - * zpool_destroy_pool() - Destroy a zpool - * @zpool: The zpool to destroy. - * - * Implementations must guarantee this to be thread-safe, - * however only when destroying different pools. The same - * pool should only be destroyed once, and should not be used - * after it is destroyed. - * - * This destroys an existing zpool. The zpool should not be in use. - */ -void zpool_destroy_pool(struct zpool *zpool) -{ - pr_debug("destroying pool type %s\n", zpool->driver->type); - - zpool->driver->destroy(zpool->pool); - zpool_put_driver(zpool->driver); - kfree(zpool); -} - -/** - * zpool_get_type() - Get the type of the zpool - * @zpool: The zpool to check - * - * This returns the type of the pool. - * - * Implementations must guarantee this to be thread-safe. - * - * Returns: The type of zpool. - */ -const char *zpool_get_type(struct zpool *zpool) -{ - return zpool->driver->type; -} - -/** - * zpool_malloc() - Allocate memory - * @zpool: The zpool to allocate from. - * @size: The amount of memory to allocate. - * @gfp: The GFP flags to use when allocating memory. - * @handle: Pointer to the handle to set - * @nid: The preferred node id. - * - * This allocates the requested amount of memory from the pool. - * The gfp flags will be used when allocating memory, if the - * implementation supports it. The provided @handle will be - * set to the allocated object handle. 
The allocation will - * prefer the NUMA node specified by @nid. - * - * Implementations must guarantee this to be thread-safe. - * - * Returns: 0 on success, negative value on error. - */ -int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, - unsigned long *handle, const int nid) -{ - return zpool->driver->malloc(zpool->pool, size, gfp, handle, nid); -} - -/** - * zpool_free() - Free previously allocated memory - * @zpool: The zpool that allocated the memory. - * @handle: The handle to the memory to free. - * - * This frees previously allocated memory. This does not guarantee - * that the pool will actually free memory, only that the memory - * in the pool will become available for use by the pool. - * - * Implementations must guarantee this to be thread-safe, - * however only when freeing different handles. The same - * handle should only be freed once, and should not be used - * after freeing. - */ -void zpool_free(struct zpool *zpool, unsigned long handle) -{ - zpool->driver->free(zpool->pool, handle); -} - -/** - * zpool_obj_read_begin() - Start reading from a previously allocated handle. - * @zpool: The zpool that the handle was allocated from - * @handle: The handle to read from - * @local_copy: A local buffer to use if needed. - * - * This starts a read operation of a previously allocated handle. The passed - * @local_copy buffer may be used if needed by copying the memory into. - * zpool_obj_read_end() MUST be called after the read is completed to undo any - * actions taken (e.g. release locks). - * - * Returns: A pointer to the handle memory to be read, if @local_copy is used, - * the returned pointer is @local_copy. - */ -void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle, - void *local_copy) -{ - return zpool->driver->obj_read_begin(zpool->pool, handle, local_copy); -} - -/** - * zpool_obj_read_end() - Finish reading from a previously allocated handle. - * @zpool: The zpool that the handle was allocated from - * @handle: The handle to read from - * @handle_mem: The pointer returned by zpool_obj_read_begin() - * - * Finishes a read operation previously started by zpool_obj_read_begin(). - */ -void zpool_obj_read_end(struct zpool *zpool, unsigned long handle, - void *handle_mem) -{ - zpool->driver->obj_read_end(zpool->pool, handle, handle_mem); -} - -/** - * zpool_obj_write() - Write to a previously allocated handle. - * @zpool: The zpool that the handle was allocated from - * @handle: The handle to read from - * @handle_mem: The memory to copy from into the handle. - * @mem_len: The length of memory to be written. - * - */ -void zpool_obj_write(struct zpool *zpool, unsigned long handle, - void *handle_mem, size_t mem_len) -{ - zpool->driver->obj_write(zpool->pool, handle, handle_mem, mem_len); -} - -/** - * zpool_get_total_pages() - The total size of the pool - * @zpool: The zpool to check - * - * This returns the total size in pages of the pool. - * - * Returns: Total size of the zpool in pages. 
- */ -u64 zpool_get_total_pages(struct zpool *zpool) -{ - return zpool->driver->total_pages(zpool->pool); -} - -MODULE_AUTHOR("Dan Streetman "); -MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 153783d49d34..5bf832f9c05c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include "zpdesc.h" @@ -433,78 +432,6 @@ static void record_obj(unsigned long handle, unsigned long obj) *(unsigned long *)handle = obj; } -/* zpool driver */ - -#ifdef CONFIG_ZPOOL - -static void *zs_zpool_create(const char *name, gfp_t gfp) -{ - /* - * Ignore global gfp flags: zs_malloc() may be invoked from - * different contexts and its caller must provide a valid - * gfp mask. - */ - return zs_create_pool(name); -} - -static void zs_zpool_destroy(void *pool) -{ - zs_destroy_pool(pool); -} - -static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, - unsigned long *handle, const int nid) -{ - *handle = zs_malloc(pool, size, gfp, nid); - - if (IS_ERR_VALUE(*handle)) - return PTR_ERR((void *)*handle); - return 0; -} -static void zs_zpool_free(void *pool, unsigned long handle) -{ - zs_free(pool, handle); -} - -static void *zs_zpool_obj_read_begin(void *pool, unsigned long handle, - void *local_copy) -{ - return zs_obj_read_begin(pool, handle, local_copy); -} - -static void zs_zpool_obj_read_end(void *pool, unsigned long handle, - void *handle_mem) -{ - zs_obj_read_end(pool, handle, handle_mem); -} - -static void zs_zpool_obj_write(void *pool, unsigned long handle, - void *handle_mem, size_t mem_len) -{ - zs_obj_write(pool, handle, handle_mem, mem_len); -} - -static u64 zs_zpool_total_pages(void *pool) -{ - return zs_get_total_pages(pool); -} - -static struct zpool_driver zs_zpool_driver = { - .type = "zsmalloc", - .owner = THIS_MODULE, - .create = zs_zpool_create, - .destroy = zs_zpool_destroy, - .malloc = zs_zpool_malloc, - .free = zs_zpool_free, - .obj_read_begin = zs_zpool_obj_read_begin, - .obj_read_end = zs_zpool_obj_read_end, - .obj_write = zs_zpool_obj_write, - .total_pages = zs_zpool_total_pages, -}; - -MODULE_ALIAS("zpool-zsmalloc"); -#endif /* CONFIG_ZPOOL */ - static inline bool __maybe_unused is_first_zpdesc(struct zpdesc *zpdesc) { return PagePrivate(zpdesc_page(zpdesc)); @@ -2248,9 +2175,6 @@ static int __init zs_init(void) { int rc __maybe_unused; -#ifdef CONFIG_ZPOOL - zpool_register_driver(&zs_zpool_driver); -#endif #ifdef CONFIG_COMPACTION rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc); if (rc) @@ -2262,9 +2186,6 @@ static int __init zs_init(void) static void __exit zs_exit(void) { -#ifdef CONFIG_ZPOOL - zpool_unregister_driver(&zs_zpool_driver); -#endif #ifdef CONFIG_COMPACTION set_movable_ops(NULL, PGTY_zsmalloc); #endif diff --git a/tools/testing/selftests/zram/README b/tools/testing/selftests/zram/README index 110b34834a6f..82921c75681c 100644 --- a/tools/testing/selftests/zram/README +++ b/tools/testing/selftests/zram/README @@ -14,7 +14,6 @@ Statistics for individual zram devices are exported through sysfs nodes at Kconfig required: CONFIG_ZRAM=y CONFIG_CRYPTO_LZ4=y -CONFIG_ZPOOL=y CONFIG_ZSMALLOC=y ZRAM Testcases -- cgit v1.2.3 From 0bf2edf041dcb0b304a8dbda8c699771d5a245d2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:27 +0200 Subject: mm/page_alloc: reject unreasonable folio/compound page sizes in alloc_contig_range_noprof() Let's reject them early, which in turn makes folio_alloc_gigantic() reject them properly. 
To avoid converting from order to nr_pages, let's just add MAX_FOLIO_ORDER and calculate MAX_FOLIO_NR_PAGES based on that. While at it, let's just make the order a "const unsigned order". Link: https://lkml.kernel.org/r/20250901150359.867252-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Acked-by: SeongJae Park Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++-- mm/page_alloc.c | 10 +++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 00c8a54127d3..77737cbf2216 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2055,11 +2055,13 @@ static inline long folio_nr_pages(const struct folio *folio) /* Only hugetlbfs can allocate folios larger than MAX_ORDER */ #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -#define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER) +#define MAX_FOLIO_ORDER PUD_ORDER #else -#define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES +#define MAX_FOLIO_ORDER MAX_PAGE_ORDER #endif +#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) + /* * compound_nr() returns the number of pages in this potentially compound * page. compound_nr() can be called on a tail page, and is defined to diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0873d640f26c..54dbb6f0d14e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6838,6 +6838,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) int alloc_contig_range_noprof(unsigned long start, unsigned long end, acr_flags_t alloc_flags, gfp_t gfp_mask) { + const unsigned int order = ilog2(end - start); unsigned long outer_start, outer_end; int ret = 0; @@ -6855,6 +6856,14 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, PB_ISOLATE_MODE_CMA_ALLOC : PB_ISOLATE_MODE_OTHER; + /* + * In contrast to the buddy, we allow for orders here that exceed + * MAX_PAGE_ORDER, so we must manually make sure that we are not + * exceeding the maximum folio order. + */ + if (WARN_ON_ONCE((gfp_mask & __GFP_COMP) && order > MAX_FOLIO_ORDER)) + return -EINVAL; + gfp_mask = current_gfp_context(gfp_mask); if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask)) return -EINVAL; @@ -6952,7 +6961,6 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, free_contig_range(end, outer_end - end); } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) { struct page *head = pfn_to_page(start); - int order = ilog2(end - start); check_new_pages(head, order); prep_new_page(head, order, gfp_mask, 0); -- cgit v1.2.3 From 4751c39eee0c3fcc742aa7d7242ce2b78faa3606 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:32 +0200 Subject: mm: limit folio/compound page sizes in problematic kernel configs Let's limit the maximum folio size in problematic kernel configs where the memmap is allocated per memory section (SPARSEMEM without SPARSEMEM_VMEMMAP) to a single memory section. Currently, only a single architecture supports ARCH_HAS_GIGANTIC_PAGE but not SPARSEMEM_VMEMMAP: sh. Fortunately, the biggest hugetlb size sh supports is 64 MiB (HUGETLB_PAGE_SIZE_64MB) and the section size is at least 64 MiB (SECTION_SIZE_BITS == 26), so its use case is not degraded. As folios and memory sections are naturally aligned to their order-2 size in memory, a single folio can consequently no longer span multiple memory sections on these problematic kernel configs.
nth_page() is no longer required when operating within a single compound page / folio. Link: https://lkml.kernel.org/r/20250901150359.867252-12-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 77737cbf2216..2dee79fa2efc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2053,11 +2053,25 @@ static inline long folio_nr_pages(const struct folio *folio) return folio_large_nr_pages(folio); } -/* Only hugetlbfs can allocate folios larger than MAX_ORDER */ -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -#define MAX_FOLIO_ORDER PUD_ORDER -#else +#if !defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) +/* + * We don't expect any folios that exceed buddy sizes (and consequently + * memory sections). + */ #define MAX_FOLIO_ORDER MAX_PAGE_ORDER +#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +/* + * Only pages within a single memory section are guaranteed to be + * contiguous. By limiting folios to a single memory section, all folio + * pages are guaranteed to be contiguous. + */ +#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT +#else +/* + * There is no real limit on the folio size. We limit them to the maximum we + * currently expect (e.g., hugetlb, dax). + */ +#define MAX_FOLIO_ORDER PUD_ORDER #endif #define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) -- cgit v1.2.3 From 73b3294b1152e94c1971a735b8db8c7503fd97a1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:33 +0200 Subject: mm: simplify folio_page() and folio_page_idx() Now that a single folio/compound page can no longer span memory sections in problematic kernel configurations, we can stop using nth_page() in folio_page() and folio_page_idx(). While at it, turn both macros into static inline functions and add kernel doc for folio_page_idx(). Link: https://lkml.kernel.org/r/20250901150359.867252-13-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ++++++++++++++-- include/linux/page-flags.h | 5 ++++- 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2dee79fa2efc..f6880e3225c5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -210,10 +210,8 @@ extern unsigned long sysctl_admin_reserve_kbytes; #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) -#define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) #else #define nth_page(page,n) ((page) + (n)) -#define folio_page_idx(folio, p) ((p) - &(folio)->page) #endif /* to align the pointer to the (next) page boundary */ @@ -225,6 +223,20 @@ extern unsigned long sysctl_admin_reserve_kbytes; /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) +/** + * folio_page_idx - Return the number of a page in a folio. + * @folio: The folio. + * @page: The folio page. + * + * This function expects that the page is actually part of the folio. + * The returned number is relative to the start of the folio. 
+ */ +static inline unsigned long folio_page_idx(const struct folio *folio, + const struct page *page) +{ + return page - &folio->page; +} + static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d53a86e68c89..a88b61eec3f8 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -316,7 +316,10 @@ static __always_inline unsigned long _compound_head(const struct page *page) * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. */ -#define folio_page(folio, n) nth_page(&(folio)->page, n) +static inline struct page *folio_page(struct folio *folio, unsigned long n) +{ + return &folio->page + n; +} static __always_inline int PageTail(const struct page *page) { -- cgit v1.2.3 From 6972706f95926838f9bd3ec2b2393c034bdb85ba Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:43 +0200 Subject: mm/cma: refuse handing out non-contiguous page ranges Let's disallow handing out PFN ranges with non-contiguous pages, so we can remove the nth_page() usage in __cma_alloc(), and so any callers don't have to worry about that either when wanting to blindly iterate pages. This is really only a problem in configs with SPARSEMEM but without SPARSEMEM_VMEMMAP, and only when we would cross memory sections in some cases. Will this cause harm? Probably not, because it's mostly 32bit that does not support SPARSEMEM_VMEMMAP. If this ever becomes a problem we could look into allocating the memmap for the memory sections spanned by a single CMA region in one go from memblock. [david@redhat.com: we can have NOMMU configs with SPARSEMEM enabled] Link: https://lkml.kernel.org/r/6ec933b1-b3f7-41c0-95d8-e518bb87375e@redhat.com Link: https://lkml.kernel.org/r/20250901150359.867252-23-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Alexandru Elisei Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++++ mm/cma.c | 39 ++++++++++++++++++++++++--------------- mm/util.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f6880e3225c5..2ca1eb2db63e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -209,9 +209,15 @@ extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +bool page_range_contiguous(const struct page *page, unsigned long nr_pages); #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #else #define nth_page(page,n) ((page) + (n)) +static inline bool page_range_contiguous(const struct page *page, + unsigned long nr_pages) +{ + return true; +} #endif /* to align the pointer to the (next) page boundary */ diff --git a/mm/cma.c b/mm/cma.c index e56ec64d0567..813e6dc7b095 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -780,10 +780,8 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, unsigned long count, unsigned int align, struct page **pagep, gfp_t gfp) { - unsigned long mask, offset; - unsigned long pfn = -1; - unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; + unsigned long start, pfn, mask, offset; int ret = -EBUSY; struct page *page = NULL; @@ -795,7 +793,7 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, if 
(bitmap_count > bitmap_maxno) goto out; - for (;;) { + for (start = 0; ; start = bitmap_no + mask + 1) { spin_lock_irq(&cma->lock); /* * If the request is larger than the available number @@ -812,6 +810,22 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, spin_unlock_irq(&cma->lock); break; } + + pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit); + page = pfn_to_page(pfn); + + /* + * Do not hand out page ranges that are not contiguous, so + * callers can just iterate the pages without having to worry + * about these corner cases. + */ + if (!page_range_contiguous(page, count)) { + spin_unlock_irq(&cma->lock); + pr_warn_ratelimited("%s: %s: skipping incompatible area [0x%lx-0x%lx]", + __func__, cma->name, pfn, pfn + count - 1); + continue; + } + bitmap_set(cmr->bitmap, bitmap_no, bitmap_count); cma->available_count -= count; /* @@ -821,29 +835,24 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, */ spin_unlock_irq(&cma->lock); - pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma->alloc_mutex); ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp); mutex_unlock(&cma->alloc_mutex); - if (ret == 0) { - page = pfn_to_page(pfn); + if (!ret) break; - } cma_clear_bitmap(cma, cmr, pfn, count); if (ret != -EBUSY) break; pr_debug("%s(): memory range at pfn 0x%lx %p is busy, retrying\n", - __func__, pfn, pfn_to_page(pfn)); + __func__, pfn, page); - trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn), - count, align); - /* try again with a bit different memory target */ - start = bitmap_no + mask + 1; + trace_cma_alloc_busy_retry(cma->name, pfn, page, count, align); } out: - *pagep = page; + if (!ret) + *pagep = page; return ret; } @@ -882,7 +891,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, */ if (page) { for (i = 0; i < count; i++) - page_kasan_tag_reset(nth_page(page, i)); + page_kasan_tag_reset(page + i); } if (ret && !(gfp & __GFP_NOWARN)) { diff --git a/mm/util.c b/mm/util.c index d235b74f7aff..4b9d40c71286 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1281,3 +1281,38 @@ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0); } #endif /* CONFIG_MMU */ + +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +/** + * page_range_contiguous - test whether the page range is contiguous + * @page: the start of the page range. + * @nr_pages: the number of pages in the range. + * + * Test whether the page range is contiguous, such that they can be iterated + * naively, corresponding to iterating a contiguous PFN range. + * + * This function should primarily only be used for debug checks, or when + * working with page ranges that are not naturally contiguous (e.g., pages + * within a folio are). + * + * Returns true if contiguous, otherwise false. + */ +bool page_range_contiguous(const struct page *page, unsigned long nr_pages) +{ + const unsigned long start_pfn = page_to_pfn(page); + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn; + + /* + * The memmap is allocated per memory section, so no need to check + * within the first section. However, we need to check each other + * spanned memory section once, making sure the first page in a + * section could similarly be reached by just iterating pages. 
+ */ + for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION); + pfn < end_pfn; pfn += PAGES_PER_SECTION) + if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn))) + return false; + return true; +} +#endif -- cgit v1.2.3 From 80e7bb74d4ff24725f0ddb1c72d8de45a3d975f6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:45 +0200 Subject: scatterlist: disallow non-contiguous page ranges in a single SG entry The expectation is that there is currently no user that would pass in non-contiguous page ranges: no allocator, not even VMA, will hand these out. The only problematic part would be if someone would provide a range obtained directly from memblock, or manually merge problematic ranges. If we find such cases, we should fix them to create separate SG entries. Let's check in sg_set_page() that this is really the case. No need to check in sg_set_folio(), as pages in a folio are guaranteed to be contiguous. As sg_set_page() gets inlined into modules, we have to export the page_range_contiguous() helper -- use EXPORT_SYMBOL, there is nothing special about this helper such that we would want to enforce GPL-only modules. We can now drop the nth_page() usage in sg_page_iter_page(). Link: https://lkml.kernel.org/r/20250901150359.867252-25-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Marek Szyprowski Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/scatterlist.h | 3 ++- mm/util.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 6f8a4965f9b9..29f6ceb98d74 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -158,6 +158,7 @@ static inline void sg_assign_page(struct scatterlist *sg, struct page *page) static inline void sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, unsigned int offset) { + VM_WARN_ON_ONCE(!page_range_contiguous(page, ALIGN(len + offset, PAGE_SIZE) / PAGE_SIZE)); sg_assign_page(sg, page); sg->offset = offset; sg->length = len; @@ -600,7 +601,7 @@ void __sg_page_iter_start(struct sg_page_iter *piter, */ static inline struct page *sg_page_iter_page(struct sg_page_iter *piter) { - return nth_page(sg_page(piter->sg), piter->sg_pgoffset); + return sg_page(piter->sg) + piter->sg_pgoffset; } /** diff --git a/mm/util.c b/mm/util.c index 4b9d40c71286..e29d3310e26b 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1315,4 +1315,5 @@ bool page_range_contiguous(const struct page *page, unsigned long nr_pages) return false; return true; } +EXPORT_SYMBOL(page_range_contiguous); #endif -- cgit v1.2.3 From d5170ce4d71b3843613ee1840bca50ad71c3671e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:57 +0200 Subject: block: update comment of "struct bio_vec" regarding nth_page() Ever since commit 858c708d9efb ("block: move the bi_size update out of __bio_try_merge_page"), page_is_mergeable() no longer exists, and the logic in bvec_try_merge_page() is now a simple page pointer comparison.
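Concretely, the invariant that the updated comment documents means the pages backing a bio_vec can be reached with plain pointer arithmetic; a minimal sketch of such a walk follows (for_each_bvec_page() is a hypothetical helper, not part of this patch):

/*
 * Sketch: visit every page backing a bio_vec. Relies on the invariant
 * documented below -- all pages from @bv_page onward are contiguous.
 */
static void for_each_bvec_page(const struct bio_vec *bv,
			       void (*fn)(struct page *))
{
	unsigned int nr = DIV_ROUND_UP(bv->bv_offset + bv->bv_len, PAGE_SIZE);
	unsigned int i;

	for (i = 0; i < nr; i++)
		fn(bv->bv_page + i);	/* no nth_page() needed */
}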
Link: https://lkml.kernel.org/r/20250901150359.867252-37-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/bvec.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 0a80e1f9aa20..3fc0efa0825b 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -22,11 +22,8 @@ struct page; * @bv_len: Number of bytes in the address range. * @bv_offset: Start of the address range relative to the start of @bv_page. * - * The following holds for a bvec if n * PAGE_SIZE < bv_offset + bv_len: - * - * nth_page(@bv_page, n) == @bv_page + n - * - * This holds because page_is_mergeable() checks the above property. + * All pages within a bio_vec starting from @bv_page are contiguous and + * can simply be iterated (see bvec_advance()). */ struct bio_vec { struct page *bv_page; -- cgit v1.2.3 From 84efbefa26df36a845a9210ee962aa6866f99bb7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:58 +0200 Subject: mm: remove nth_page() Now that all users are gone, let's remove it. Link: https://lkml.kernel.org/r/20250901150359.867252-38-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- tools/testing/scatterlist/linux/mm.h | 1 - 2 files changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ca1eb2db63e..b26ca8b2162d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -210,9 +210,7 @@ extern unsigned long sysctl_admin_reserve_kbytes; #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) bool page_range_contiguous(const struct page *page, unsigned long nr_pages); -#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #else -#define nth_page(page,n) ((page) + (n)) static inline bool page_range_contiguous(const struct page *page, unsigned long nr_pages) { diff --git a/tools/testing/scatterlist/linux/mm.h b/tools/testing/scatterlist/linux/mm.h index 5bd9e6e80625..121ae78d6e88 100644 --- a/tools/testing/scatterlist/linux/mm.h +++ b/tools/testing/scatterlist/linux/mm.h @@ -51,7 +51,6 @@ static inline unsigned long page_to_phys(struct page *page) #define page_to_pfn(page) ((unsigned long)(page) / PAGE_SIZE) #define pfn_to_page(pfn) (void *)((pfn) * PAGE_SIZE) -#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #define __min(t1, t2, min1, min2, x, y) ({ \ t1 min1 = (x); \ -- cgit v1.2.3 From 4a25f995bd59843a898b531bb3e472d710ef9439 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 10 Sep 2025 21:39:56 +0800 Subject: mm: hugetlb: directly pass order when allocating a hugetlb folio Use the order instead of struct hstate to remove the huge_page_order() call from all hugetlb folio allocation paths; also add order_is_gigantic() to check whether an order is gigantic.
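As a worked example (assuming a common x86-64 configuration with PAGE_SHIFT == 12 and MAX_PAGE_ORDER == 10, values this patch does not change): a 2 MiB hugetlb page has order 21 - 12 = 9, so order_is_gigantic(9) is false and alloc_buddy_hugetlb_folio() is used, while a 1 GiB page has order 30 - 12 = 18 > MAX_PAGE_ORDER, so it must take the alloc_gigantic_folio() path.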
Link: https://lkml.kernel.org/r/20250910133958.301467-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Oscar Salvador Reviewed-by: Sidhartha Kumar Reviewed-by: Jane Chu Reviewed-by: Zi Yan Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 ++++++- mm/hugetlb.c | 29 ++++++++++++++--------------- mm/hugetlb_cma.c | 3 +-- mm/hugetlb_cma.h | 6 +++--- 4 files changed, 24 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 526d27e88b3b..8e63e46b8e1f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -788,9 +788,14 @@ static inline unsigned huge_page_shift(struct hstate *h) return h->order + PAGE_SHIFT; } +static inline bool order_is_gigantic(unsigned int order) +{ + return order > MAX_PAGE_ORDER; +} + static inline bool hstate_is_gigantic(struct hstate *h) { - return huge_page_order(h) > MAX_PAGE_ORDER; + return order_is_gigantic(huge_page_order(h)); } static inline unsigned int pages_per_huge_page(const struct hstate *h) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ef6284ec85b6..7f33e4a158c6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1473,17 +1473,16 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE #ifdef CONFIG_CONTIG_ALLOC -static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { struct folio *folio; - int order = huge_page_order(h); bool retried = false; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); retry: - folio = hugetlb_cma_alloc_folio(h, gfp_mask, nid, nodemask); + folio = hugetlb_cma_alloc_folio(order, gfp_mask, nid, nodemask); if (!folio) { if (hugetlb_cma_exclusive_alloc()) return NULL; @@ -1506,16 +1505,16 @@ retry: } #else /* !CONFIG_CONTIG_ALLOC */ -static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, + nodemask_t *nodemask) { return NULL; } #endif /* CONFIG_CONTIG_ALLOC */ #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ -static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, + nodemask_t *nodemask) { return NULL; } @@ -1926,11 +1925,9 @@ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio) return NULL; } -static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask, - nodemask_t *node_alloc_noretry) +static struct folio *alloc_buddy_hugetlb_folio(int order, gfp_t gfp_mask, + int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { - int order = huge_page_order(h); struct folio *folio; bool alloc_try_hard = true; @@ -1980,11 +1977,13 @@ static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, nodemask_t *node_alloc_noretry) { struct folio *folio; + int order = huge_page_order(h); - if (hstate_is_gigantic(h)) - folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); + if (order_is_gigantic(order)) + folio = alloc_gigantic_folio(order, gfp_mask, nid, nmask); else - folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); + folio = alloc_buddy_hugetlb_folio(order, gfp_mask, nid, nmask, + 
node_alloc_noretry); if (folio) init_new_hugetlb_folio(h, folio); return folio; } @@ -2872,7 +2871,7 @@ int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) * alloc_contig_range and them. Return -ENOMEM as this has the effect * of bailing out right away without further retrying. */ - if (folio_order(folio) > MAX_PAGE_ORDER) + if (order_is_gigantic(folio_order(folio))) return -ENOMEM; if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list)) diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index f58ef4969e7a..e8e4dc7182d5 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -26,11 +26,10 @@ void hugetlb_cma_free_folio(struct folio *folio) } -struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask, +struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { int node; - int order = huge_page_order(h); struct folio *folio = NULL; if (hugetlb_cma[nid]) diff --git a/mm/hugetlb_cma.h b/mm/hugetlb_cma.h index f7d7fb9880a2..2c2ec8a7e134 100644 --- a/mm/hugetlb_cma.h +++ b/mm/hugetlb_cma.h @@ -4,7 +4,7 @@ #ifdef CONFIG_CMA void hugetlb_cma_free_folio(struct folio *folio); -struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask, +struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask); struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact); @@ -18,8 +18,8 @@ static inline void hugetlb_cma_free_folio(struct folio *folio) { } -static inline struct folio *hugetlb_cma_alloc_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nodemask) +static inline struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { return NULL; } -- cgit v1.2.3 From 8eccb066f28747e966bda716cb90dbca13b78032 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:10 +0200 Subject: mm: constify shmem related test functions for improved const-correctness Patch series "mm: establish const-correctness for pointer parameters", v6. This series improves const-correctness in the low-level memory-management subsystem, which provides a basis for further constification further up the call stack (e.g. filesystems). I started this work when I tried to constify the Ceph filesystem code, but found that to be impossible because many "mm" functions accept non-const pointers, even though they modify nothing. This patch (of 12): We select certain test functions which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. Link: https://lkml.kernel.org/r/20250901205021.3573313-1-max.kellermann@ionos.com Link: https://lkml.kernel.org/r/20250901205021.3573313-2-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H.
Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++---- include/linux/shmem_fs.h | 4 ++-- mm/shmem.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b26ca8b2162d..45a47b555499 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -995,11 +995,11 @@ static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) * The vma_is_shmem is not inline because it is used only by slow * paths in userfault. */ -bool vma_is_shmem(struct vm_area_struct *vma); -bool vma_is_anon_shmem(struct vm_area_struct *vma); +bool vma_is_shmem(const struct vm_area_struct *vma); +bool vma_is_anon_shmem(const struct vm_area_struct *vma); #else -static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } -static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; } +static inline bool vma_is_shmem(const struct vm_area_struct *vma) { return false; } +static inline bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(struct vm_area_struct *vma); diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 6d0f9c599ff7..0e47465ef0fd 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -99,9 +99,9 @@ extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); #ifdef CONFIG_SHMEM -bool shmem_mapping(struct address_space *mapping); +bool shmem_mapping(const struct address_space *mapping); #else -static inline bool shmem_mapping(struct address_space *mapping) +static inline bool shmem_mapping(const struct address_space *mapping) { return false; } diff --git a/mm/shmem.c b/mm/shmem.c index 640fecc42f60..2df26f4d6e60 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -275,18 +275,18 @@ static const struct vm_operations_struct shmem_vm_ops; static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; -bool shmem_mapping(struct address_space *mapping) +bool shmem_mapping(const struct address_space *mapping) { return mapping->a_ops == &shmem_aops; } EXPORT_SYMBOL_GPL(shmem_mapping); -bool vma_is_anon_shmem(struct vm_area_struct *vma) +bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return vma->vm_ops == &shmem_anon_vm_ops; } -bool vma_is_shmem(struct vm_area_struct *vma) +bool vma_is_shmem(const struct vm_area_struct *vma) { return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; } -- cgit v1.2.3 From 7c3e97ac0d75306d9d03de575c9878f8fd9efe3b Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:11 +0200 Subject: mm: constify pagemap related test/getter functions For improved const-correctness. We select certain test functions which either invoke each other, functions that are already const-ified, or no further functions. 
It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. Link: https://lkml.kernel.org/r/20250901205021.3573313-3-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 55 +++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index f0dfdfb13cd9..0d66a252b06f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -140,7 +140,7 @@ static inline int inode_drain_writes(struct inode *inode) return filemap_write_and_wait(inode->i_mapping); } -static inline bool mapping_empty(struct address_space *mapping) +static inline bool mapping_empty(const struct address_space *mapping) { return xa_empty(&mapping->i_pages); } @@ -166,7 +166,7 @@ static inline bool mapping_empty(struct address_space *mapping) * refcount and the referenced bit, which will be elevated or set in * the process of adding new cache pages to an inode. 
*/ -static inline bool mapping_shrinkable(struct address_space *mapping) +static inline bool mapping_shrinkable(const struct address_space *mapping) { void *head; @@ -267,7 +267,7 @@ static inline void mapping_clear_unevictable(struct address_space *mapping) clear_bit(AS_UNEVICTABLE, &mapping->flags); } -static inline bool mapping_unevictable(struct address_space *mapping) +static inline bool mapping_unevictable(const struct address_space *mapping) { return mapping && test_bit(AS_UNEVICTABLE, &mapping->flags); } @@ -277,7 +277,7 @@ static inline void mapping_set_exiting(struct address_space *mapping) set_bit(AS_EXITING, &mapping->flags); } -static inline int mapping_exiting(struct address_space *mapping) +static inline int mapping_exiting(const struct address_space *mapping) { return test_bit(AS_EXITING, &mapping->flags); } @@ -287,7 +287,7 @@ static inline void mapping_set_no_writeback_tags(struct address_space *mapping) set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } -static inline int mapping_use_writeback_tags(struct address_space *mapping) +static inline int mapping_use_writeback_tags(const struct address_space *mapping) { return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } @@ -333,7 +333,7 @@ static inline void mapping_set_inaccessible(struct address_space *mapping) set_bit(AS_INACCESSIBLE, &mapping->flags); } -static inline bool mapping_inaccessible(struct address_space *mapping) +static inline bool mapping_inaccessible(const struct address_space *mapping) { return test_bit(AS_INACCESSIBLE, &mapping->flags); } @@ -343,18 +343,18 @@ static inline void mapping_set_writeback_may_deadlock_on_reclaim(struct address_ set_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } -static inline bool mapping_writeback_may_deadlock_on_reclaim(struct address_space *mapping) +static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct address_space *mapping) { return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } -static inline gfp_t mapping_gfp_mask(struct address_space * mapping) +static inline gfp_t mapping_gfp_mask(const struct address_space *mapping) { return mapping->gfp_mask; } /* Restricts the given gfp_mask to what the mapping allows. */ -static inline gfp_t mapping_gfp_constraint(struct address_space *mapping, +static inline gfp_t mapping_gfp_constraint(const struct address_space *mapping, gfp_t gfp_mask) { return mapping_gfp_mask(mapping) & gfp_mask; @@ -477,7 +477,7 @@ mapping_min_folio_order(const struct address_space *mapping) } static inline unsigned long -mapping_min_folio_nrpages(struct address_space *mapping) +mapping_min_folio_nrpages(const struct address_space *mapping) { return 1UL << mapping_min_folio_order(mapping); } @@ -491,7 +491,7 @@ mapping_min_folio_nrpages(struct address_space *mapping) * new folio to the page cache and need to know what index to give it, * call this function. */ -static inline pgoff_t mapping_align_index(struct address_space *mapping, +static inline pgoff_t mapping_align_index(const struct address_space *mapping, pgoff_t index) { return round_down(index, mapping_min_folio_nrpages(mapping)); @@ -501,7 +501,7 @@ static inline pgoff_t mapping_align_index(struct address_space *mapping, * Large folio support currently depends on THP. These dependencies are * being worked on but are not yet fixed. 
*/ -static inline bool mapping_large_folio_support(struct address_space *mapping) +static inline bool mapping_large_folio_support(const struct address_space *mapping) { /* AS_FOLIO_ORDER is only reasonable for pagecache folios */ VM_WARN_ONCE((unsigned long)mapping & FOLIO_MAPPING_ANON, @@ -516,7 +516,7 @@ static inline size_t mapping_max_folio_size(const struct address_space *mapping) return PAGE_SIZE << mapping_max_folio_order(mapping); } -static inline int filemap_nr_thps(struct address_space *mapping) +static inline int filemap_nr_thps(const struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS return atomic_read(&mapping->nr_thps); @@ -930,7 +930,7 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, * * Return: The index of the folio which follows this folio in the file. */ -static inline pgoff_t folio_next_index(struct folio *folio) +static inline pgoff_t folio_next_index(const struct folio *folio) { return folio->index + folio_nr_pages(folio); } @@ -959,7 +959,7 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) * e.g., shmem did not move this folio to the swap cache. * Return: true or false. */ -static inline bool folio_contains(struct folio *folio, pgoff_t index) +static inline bool folio_contains(const struct folio *folio, pgoff_t index) { VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); return index - folio->index < folio_nr_pages(folio); @@ -1036,13 +1036,13 @@ static inline loff_t page_offset(struct page *page) /* * Get the offset in PAGE_SIZE (even for hugetlb folios). */ -static inline pgoff_t folio_pgoff(struct folio *folio) +static inline pgoff_t folio_pgoff(const struct folio *folio) { return folio->index; } -static inline pgoff_t linear_page_index(struct vm_area_struct *vma, - unsigned long address) +static inline pgoff_t linear_page_index(const struct vm_area_struct *vma, + const unsigned long address) { pgoff_t pgoff; pgoff = (address - vma->vm_start) >> PAGE_SHIFT; @@ -1462,7 +1462,7 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, * readahead_pos - The byte offset into the file of this readahead request. * @rac: The readahead request. */ -static inline loff_t readahead_pos(struct readahead_control *rac) +static inline loff_t readahead_pos(const struct readahead_control *rac) { return (loff_t)rac->_index * PAGE_SIZE; } @@ -1471,7 +1471,7 @@ static inline loff_t readahead_pos(struct readahead_control *rac) * readahead_length - The number of bytes in this readahead request. * @rac: The readahead request. */ -static inline size_t readahead_length(struct readahead_control *rac) +static inline size_t readahead_length(const struct readahead_control *rac) { return rac->_nr_pages * PAGE_SIZE; } @@ -1480,7 +1480,7 @@ static inline size_t readahead_length(struct readahead_control *rac) * readahead_index - The index of the first page in this readahead request. * @rac: The readahead request. */ -static inline pgoff_t readahead_index(struct readahead_control *rac) +static inline pgoff_t readahead_index(const struct readahead_control *rac) { return rac->_index; } @@ -1489,7 +1489,7 @@ static inline pgoff_t readahead_index(struct readahead_control *rac) * readahead_count - The number of pages in this readahead request. * @rac: The readahead request. 
*/ -static inline unsigned int readahead_count(struct readahead_control *rac) +static inline unsigned int readahead_count(const struct readahead_control *rac) { return rac->_nr_pages; } @@ -1498,12 +1498,12 @@ static inline unsigned int readahead_count(struct readahead_control *rac) * readahead_batch_length - The number of bytes in the current batch. * @rac: The readahead request. */ -static inline size_t readahead_batch_length(struct readahead_control *rac) +static inline size_t readahead_batch_length(const struct readahead_control *rac) { return rac->_batch_count * PAGE_SIZE; } -static inline unsigned long dir_pages(struct inode *inode) +static inline unsigned long dir_pages(const struct inode *inode) { return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -1517,8 +1517,8 @@ static inline unsigned long dir_pages(struct inode *inode) * Return: the number of bytes in the folio up to EOF, * or -EFAULT if the folio was truncated. */ -static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio, - struct inode *inode) +static inline ssize_t folio_mkwrite_check_truncate(const struct folio *folio, + const struct inode *inode) { loff_t size = i_size_read(inode); pgoff_t index = size >> PAGE_SHIFT; @@ -1549,7 +1549,8 @@ static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio, * Return: The number of filesystem blocks covered by this folio. */ static inline -unsigned int i_blocks_per_folio(struct inode *inode, struct folio *folio) +unsigned int i_blocks_per_folio(const struct inode *inode, + const struct folio *folio) { return folio_size(folio) >> inode->i_blkbits; } -- cgit v1.2.3 From 959b0886256b6896b44633e0e07c5464169087c1 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:12 +0200 Subject: mm: constify zone related test/getter functions For improved const-correctness. We select certain test functions which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. Link: https://lkml.kernel.org/r/20250901205021.3573313-4-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. 
Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f3272ef5131b..6c4eae96160d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1104,7 +1104,7 @@ static inline unsigned long promo_wmark_pages(const struct zone *z) return wmark_pages(z, WMARK_PROMO); } -static inline unsigned long zone_managed_pages(struct zone *zone) +static inline unsigned long zone_managed_pages(const struct zone *zone) { return (unsigned long)atomic_long_read(&zone->managed_pages); } @@ -1128,12 +1128,12 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); } -static inline bool zone_is_initialized(struct zone *zone) +static inline bool zone_is_initialized(const struct zone *zone) { return zone->initialized; } -static inline bool zone_is_empty(struct zone *zone) +static inline bool zone_is_empty(const struct zone *zone) { return zone->spanned_pages == 0; } @@ -1273,7 +1273,7 @@ static inline bool folio_is_zone_movable(const struct folio *folio) * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty * intersection with the given zone */ -static inline bool zone_intersects(struct zone *zone, +static inline bool zone_intersects(const struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { if (zone_is_empty(zone)) @@ -1581,12 +1581,12 @@ static inline int local_memory_node(int node_id) { return node_id; }; #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) #ifdef CONFIG_ZONE_DEVICE -static inline bool zone_is_zone_device(struct zone *zone) +static inline bool zone_is_zone_device(const struct zone *zone) { return zone_idx(zone) == ZONE_DEVICE; } #else -static inline bool zone_is_zone_device(struct zone *zone) +static inline bool zone_is_zone_device(const struct zone *zone) { return false; } @@ -1598,19 +1598,19 @@ static inline bool zone_is_zone_device(struct zone *zone) * populated_zone(). If the whole zone is reserved then we can easily * end up with populated_zone() && !managed_zone(). 
*/ -static inline bool managed_zone(struct zone *zone) +static inline bool managed_zone(const struct zone *zone) { return zone_managed_pages(zone); } /* Returns true if a zone has memory */ -static inline bool populated_zone(struct zone *zone) +static inline bool populated_zone(const struct zone *zone) { return zone->present_pages; } #ifdef CONFIG_NUMA -static inline int zone_to_nid(struct zone *zone) +static inline int zone_to_nid(const struct zone *zone) { return zone->node; } @@ -1620,7 +1620,7 @@ static inline void zone_set_nid(struct zone *zone, int nid) zone->node = nid; } #else -static inline int zone_to_nid(struct zone *zone) +static inline int zone_to_nid(const struct zone *zone) { return 0; } @@ -1647,7 +1647,7 @@ static inline int is_highmem_idx(enum zone_type idx) * @zone: pointer to struct zone variable * Return: 1 for a highmem zone, 0 otherwise */ -static inline int is_highmem(struct zone *zone) +static inline int is_highmem(const struct zone *zone) { return is_highmem_idx(zone_idx(zone)); } @@ -1713,12 +1713,12 @@ static inline struct zone *zonelist_zone(struct zoneref *zoneref) return zoneref->zone; } -static inline int zonelist_zone_idx(struct zoneref *zoneref) +static inline int zonelist_zone_idx(const struct zoneref *zoneref) { return zoneref->zone_idx; } -static inline int zonelist_node_idx(struct zoneref *zoneref) +static inline int zonelist_node_idx(const struct zoneref *zoneref) { return zone_to_nid(zoneref->zone); } @@ -2021,7 +2021,7 @@ static inline struct page *__section_mem_map_addr(struct mem_section *section) return (struct page *)map; } -static inline int present_section(struct mem_section *section) +static inline int present_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); } @@ -2031,12 +2031,12 @@ static inline int present_section_nr(unsigned long nr) return present_section(__nr_to_section(nr)); } -static inline int valid_section(struct mem_section *section) +static inline int valid_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); } -static inline int early_section(struct mem_section *section) +static inline int early_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_EARLY)); } @@ -2046,27 +2046,27 @@ static inline int valid_section_nr(unsigned long nr) return valid_section(__nr_to_section(nr)); } -static inline int online_section(struct mem_section *section) +static inline int online_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } #ifdef CONFIG_ZONE_DEVICE -static inline int online_device_section(struct mem_section *section) +static inline int online_device_section(const struct mem_section *section) { unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; return section && ((section->section_mem_map & flags) == flags); } #else -static inline int online_device_section(struct mem_section *section) +static inline int online_device_section(const struct mem_section *section) { return 0; } #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT -static inline int preinited_vmemmap_section(struct mem_section *section) +static inline int preinited_vmemmap_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT)); @@ -2076,7 +2076,7 @@ void sparse_vmemmap_init_nid_early(int nid); void sparse_vmemmap_init_nid_late(int nid); #else -static inline int 
preinited_vmemmap_section(struct mem_section *section) +static inline int preinited_vmemmap_section(const struct mem_section *section) { return 0; } -- cgit v1.2.3 From b119fb0927738f150cbd179d23d08057dccd75c1 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:13 +0200 Subject: fs: constify mapping related test functions for improved const-correctness We select certain test functions which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. Link: https://lkml.kernel.org/r/20250901205021.3573313-5-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..0783c5d05d3f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -537,7 +537,7 @@ struct address_space { /* * Returns true if any of the pages in the mapping are marked with the tag. */ -static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag) +static inline bool mapping_tagged(const struct address_space *mapping, xa_mark_t tag) { return xa_marked(&mapping->i_pages, tag); } @@ -585,7 +585,7 @@ static inline void i_mmap_assert_write_locked(struct address_space *mapping) /* * Might pages of this file be mapped into userspace? */ -static inline int mapping_mapped(struct address_space *mapping) +static inline int mapping_mapped(const struct address_space *mapping) { return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root); } @@ -599,7 +599,7 @@ static inline int mapping_mapped(struct address_space *mapping) * If i_mmap_writable is negative, no new writable mappings are allowed. You * can only deny writable mappings, if none exists right now. */ -static inline int mapping_writably_mapped(struct address_space *mapping) +static inline int mapping_writably_mapped(const struct address_space *mapping) { return atomic_read(&mapping->i_mmap_writable) > 0; } -- cgit v1.2.3 From 4680092f8ccb4406e771a6b1a2c0243ebd40bab7 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:14 +0200 Subject: mm: constify process_shares_mm() for improved const-correctness This function only reads from the pointer arguments. Local (loop) variables are also annotated with `const` to clarify that these will not be written to. 
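For illustration, a minimal hypothetical caller sketch (the wrapper below is invented, not part of this patch) showing what the constification buys: code that holds only read-only pointers can now call the helper without dropping the qualifier:

/* hypothetical read-only caller; legal now that both parameters are const */
static bool mm_still_shared(const struct task_struct *victim,
			    const struct mm_struct *mm)
{
	return process_shares_mm(victim, mm);
}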
Link: https://lkml.kernel.org/r/20250901205021.3573313-6-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/oom_kill.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 45a47b555499..b3b63058e1a3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3872,7 +3872,7 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) } #endif /* __HAVE_ARCH_GATE_AREA */ -extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); +bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm); void drop_slab(void); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 17650f0b516e..58bd4cf71d52 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -490,12 +490,12 @@ static bool oom_killer_disabled __read_mostly; * task's threads: if one of those is using this mm then this task was also * using it. */ -bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm) { - struct task_struct *t; + const struct task_struct *t; for_each_thread(p, t) { - struct mm_struct *t_mm = READ_ONCE(t->mm); + const struct mm_struct *t_mm = READ_ONCE(t->mm); if (t_mm) return t_mm == mm; } -- cgit v1.2.3 From 0bf25cfc9e795ab302ee23550fdeebd2aeedf800 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:15 +0200 Subject: mm, s390: constify mapping related test/getter functions For improved const-correctness. We select certain test functions which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. (Even though seemingly unrelated, this also constifies the pointer parameter of mmap_is_legacy() in arch/s390/mm/mmap.c because a copy of the function exists in mm/util.c.) Link: https://lkml.kernel.org/r/20250901205021.3573313-7-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. 
Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- arch/s390/mm/mmap.c | 2 +- include/linux/mm.h | 6 +++--- include/linux/pagemap.h | 2 +- mm/util.c | 10 +++++----- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 547104ccc22a..e188cb6d4946 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -27,7 +27,7 @@ static unsigned long stack_maxrandom_size(void) return STACK_RND_MASK << PAGE_SHIFT; } -static inline int mmap_is_legacy(struct rlimit *rlim_stack) +static inline int mmap_is_legacy(const struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; diff --git a/include/linux/mm.h b/include/linux/mm.h index b3b63058e1a3..221e98bb7689 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1002,7 +1002,7 @@ static inline bool vma_is_shmem(const struct vm_area_struct *vma) { return false static inline bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return false; } #endif -int vma_is_stack_for_current(struct vm_area_struct *vma); +int vma_is_stack_for_current(const struct vm_area_struct *vma); /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } @@ -2617,7 +2617,7 @@ void folio_add_pin(struct folio *folio); int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, - struct task_struct *task, bool bypass_rlim); + const struct task_struct *task, bool bypass_rlim); struct kvec; struct page *get_dump_page(unsigned long addr, int *locked); @@ -3380,7 +3380,7 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); avc; avc = anon_vma_interval_tree_iter_next(avc, start, last)) /* mmap.c */ -extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); +extern int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0d66a252b06f..aec4a11565bc 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -545,7 +545,7 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) #endif } -struct address_space *folio_mapping(struct folio *); +struct address_space *folio_mapping(const struct folio *folio); /** * folio_flush_mapping - Find the file mapping this folio belongs to. 
diff --git a/mm/util.c b/mm/util.c index e29d3310e26b..391f6e7daf83 100644 --- a/mm/util.c +++ b/mm/util.c @@ -315,7 +315,7 @@ void *memdup_user_nul(const void __user *src, size_t len) EXPORT_SYMBOL(memdup_user_nul); /* Check if the vma is being used as a stack by this task */ -int vma_is_stack_for_current(struct vm_area_struct *vma) +int vma_is_stack_for_current(const struct vm_area_struct *vma) { struct task_struct * __maybe_unused t = current; @@ -410,7 +410,7 @@ unsigned long arch_mmap_rnd(void) return rnd << PAGE_SHIFT; } -static int mmap_is_legacy(struct rlimit *rlim_stack) +static int mmap_is_legacy(const struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; @@ -504,7 +504,7 @@ EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout); * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, - struct task_struct *task, bool bypass_rlim) + const struct task_struct *task, bool bypass_rlim) { unsigned long locked_vm, limit; int ret = 0; @@ -688,7 +688,7 @@ struct anon_vma *folio_anon_vma(const struct folio *folio) * You can call this for folios which aren't in the swap cache or page * cache and it will return NULL. */ -struct address_space *folio_mapping(struct folio *folio) +struct address_space *folio_mapping(const struct folio *folio) { struct address_space *mapping; @@ -926,7 +926,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); * Note this is a helper function intended to be used by LSMs which * wish to use this logic. */ -int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) +int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin) { long allowed; unsigned long bytes_failed; -- cgit v1.2.3 From a955cca37288fe37cc1cde8d291e02717c8a7409 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:17 +0200 Subject: mm: constify arch_pick_mmap_layout() for improved const-correctness This function only reads from the rlimit pointer (but writes to the mm_struct pointer which is kept without `const`). All callees are already const-ified or (internal functions) are being constified by this patch. Link: https://lkml.kernel.org/r/20250901205021.3573313-9-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. 
Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- arch/s390/mm/mmap.c | 4 ++-- arch/sparc/kernel/sys_sparc_64.c | 2 +- arch/x86/mm/mmap.c | 6 +++--- include/linux/sched/mm.h | 4 ++-- mm/util.c | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index e188cb6d4946..197c1d9497a7 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -47,7 +47,7 @@ static unsigned long mmap_base_legacy(unsigned long rnd) } static inline unsigned long mmap_base(unsigned long rnd, - struct rlimit *rlim_stack) + const struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size() + stack_guard_gap; @@ -169,7 +169,7 @@ check_asce_limit: * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 785e9909340f..55faf2effa46 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -294,7 +294,7 @@ static unsigned long mmap_rnd(void) return rnd << PAGE_SHIFT; } -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { unsigned long random_factor = mmap_rnd(); unsigned long gap; diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 708f85dc9380..82f3a987f7cf 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -80,7 +80,7 @@ unsigned long arch_mmap_rnd(void) } static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, - struct rlimit *rlim_stack) + const struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; @@ -110,7 +110,7 @@ static unsigned long mmap_legacy_base(unsigned long rnd, */ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, unsigned long random_factor, unsigned long task_size, - struct rlimit *rlim_stack) + const struct rlimit *rlim_stack) { *legacy_base = mmap_legacy_base(random_factor, task_size); if (mmap_is_legacy()) @@ -119,7 +119,7 @@ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, *base = mmap_base(random_factor, task_size, rlim_stack); } -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { if (mmap_is_legacy()) mm_flags_clear(MMF_TOPDOWN, mm); diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 2201da0afecc..0232d983b715 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -178,7 +178,7 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #endif extern void arch_pick_mmap_layout(struct mm_struct *mm, - struct rlimit *rlim_stack); + const struct rlimit 
*rlim_stack); unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, @@ -211,7 +211,7 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long flags, vm_flags_t vm_flags); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm, - struct rlimit *rlim_stack) {} + const struct rlimit *rlim_stack) {} #endif static inline bool in_vfork(struct task_struct *tsk) diff --git a/mm/util.c b/mm/util.c index 391f6e7daf83..732a2dfcaec7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -431,7 +431,7 @@ static int mmap_is_legacy(const struct rlimit *rlim_stack) #define MIN_GAP (SZ_128M) #define MAX_GAP (STACK_TOP / 6 * 5) -static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) +static unsigned long mmap_base(const unsigned long rnd, const struct rlimit *rlim_stack) { #ifdef CONFIG_STACK_GROWSUP /* @@ -462,7 +462,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) #endif } -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; @@ -478,7 +478,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; mm_flags_clear(MMF_TOPDOWN, mm); -- cgit v1.2.3 From 89bf840b84bb53393436426cd4acd80604bd26fd Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:18 +0200 Subject: mm: constify ptdesc_pmd_pts_count() and folio_get_private() These functions from mm_types.h are trivial getters that should never write to the given pointers. Link: https://lkml.kernel.org/r/20250901205021.3573313-10-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. 
Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d934a3a5b443..275e8060d918 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -632,7 +632,7 @@ static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc) atomic_dec(&ptdesc->pt_share_count); } -static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc) +static inline int ptdesc_pmd_pts_count(const struct ptdesc *ptdesc) { return atomic_read(&ptdesc->pt_share_count); } @@ -660,7 +660,7 @@ static inline void set_page_private(struct page *page, unsigned long private) page->private = private; } -static inline void *folio_get_private(struct folio *folio) +static inline void *folio_get_private(const struct folio *folio) { return folio->private; } -- cgit v1.2.3 From f346a9473a2fbbab785d1733d475160f1fc54e5a Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:19 +0200 Subject: mm: constify various inline functions for improved const-correctness We select certain test functions plus folio_migrate_refs() from mm_inline.h which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. One exception is the function folio_migrate_refs() which does write to the "new" folio pointer; there, only the "old" folio pointer is being constified; only its "flags" field is read, but nothing written. Link: https://lkml.kernel.org/r/20250901205021.3573313-11-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 150302b4a905..d6c1011b38f2 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -25,7 +25,7 @@ * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise * ram or swap backed folio. 
*/ -static inline int folio_is_file_lru(struct folio *folio) +static inline int folio_is_file_lru(const struct folio *folio) { return !folio_test_swapbacked(folio); } @@ -84,7 +84,7 @@ static __always_inline void __folio_clear_lru_flags(struct folio *folio) * Return: The LRU list a folio should be on, as an index * into the array of LRU lists. */ -static __always_inline enum lru_list folio_lru_list(struct folio *folio) +static __always_inline enum lru_list folio_lru_list(const struct folio *folio) { enum lru_list lru; @@ -141,7 +141,7 @@ static inline int lru_tier_from_refs(int refs, bool workingset) return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs); } -static inline int folio_lru_refs(struct folio *folio) +static inline int folio_lru_refs(const struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags.f); @@ -154,14 +154,14 @@ static inline int folio_lru_refs(struct folio *folio) return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1; } -static inline int folio_lru_gen(struct folio *folio) +static inline int folio_lru_gen(const struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags.f); return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } -static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) +static inline bool lru_gen_is_active(const struct lruvec *lruvec, int gen) { unsigned long max_seq = lruvec->lrugen.max_seq; @@ -217,12 +217,13 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } -static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio, +static inline unsigned long lru_gen_folio_seq(const struct lruvec *lruvec, + const struct folio *folio, bool reclaiming) { int gen; int type = folio_is_file_lru(folio); - struct lru_gen_folio *lrugen = &lruvec->lrugen; + const struct lru_gen_folio *lrugen = &lruvec->lrugen; /* * +-----------------------------------+-----------------------------------+ @@ -302,7 +303,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, return true; } -static inline void folio_migrate_refs(struct folio *new, struct folio *old) +static inline void folio_migrate_refs(struct folio *new, const struct folio *old) { unsigned long refs = READ_ONCE(old->flags.f) & LRU_REFS_MASK; @@ -330,7 +331,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, return false; } -static inline void folio_migrate_refs(struct folio *new, struct folio *old) +static inline void folio_migrate_refs(struct folio *new, const struct folio *old) { } @@ -508,7 +509,7 @@ static inline void dec_tlb_flush_pending(struct mm_struct *mm) atomic_dec(&mm->tlb_flush_pending); } -static inline bool mm_tlb_flush_pending(struct mm_struct *mm) +static inline bool mm_tlb_flush_pending(const struct mm_struct *mm) { /* * Must be called after having acquired the PTL; orders against that @@ -521,7 +522,7 @@ static inline bool mm_tlb_flush_pending(struct mm_struct *mm) return atomic_read(&mm->tlb_flush_pending); } -static inline bool mm_tlb_flush_nested(struct mm_struct *mm) +static inline bool mm_tlb_flush_nested(const struct mm_struct *mm) { /* * Similar to mm_tlb_flush_pending(), we must have acquired the PTL @@ -605,7 +606,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, return false; } -static inline bool vma_has_recency(struct vm_area_struct *vma) +static inline bool vma_has_recency(const struct vm_area_struct *vma) 
{ if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) return false; -- cgit v1.2.3 From da0045587d59d4ffd7710fa45cea51e5a48453a4 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:20 +0200 Subject: mm: constify assert/test functions in mm.h For improved const-correctness. We select certain assert and test functions which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. Link: https://lkml.kernel.org/r/20250901205021.3573313-12-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mm.h | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 221e98bb7689..a6bfa46937a8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -719,7 +719,7 @@ static inline void release_fault_lock(struct vm_fault *vmf) mmap_read_unlock(vmf->vma->vm_mm); } -static inline void assert_fault_locked(struct vm_fault *vmf) +static inline void assert_fault_locked(const struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_assert_locked(vmf->vma); @@ -732,7 +732,7 @@ static inline void release_fault_lock(struct vm_fault *vmf) mmap_read_unlock(vmf->vma->vm_mm); } -static inline void assert_fault_locked(struct vm_fault *vmf) +static inline void assert_fault_locked(const struct vm_fault *vmf) { mmap_assert_locked(vmf->vma->vm_mm); } @@ -875,7 +875,7 @@ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) vma->vm_end >= vma->vm_mm->start_stack; } -static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) +static inline bool vma_is_temporary_stack(const struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); @@ -889,7 +889,7 @@ static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) return false; } -static inline bool vma_is_foreign(struct vm_area_struct *vma) +static inline bool vma_is_foreign(const struct vm_area_struct *vma) { if (!current->mm) return true; @@ -900,7 +900,7 @@ static inline bool vma_is_foreign(struct vm_area_struct *vma) return false; } -static inline bool vma_is_accessible(struct vm_area_struct *vma) +static inline bool vma_is_accessible(const struct vm_area_struct *vma) { return vma->vm_flags & VM_ACCESS_FLAGS; } @@ -911,7 +911,7 @@ static inline bool is_shared_maywrite(vm_flags_t vm_flags) (VM_SHARED | VM_MAYWRITE); } -static inline bool vma_is_shared_maywrite(struct 
vm_area_struct *vma) +static inline bool vma_is_shared_maywrite(const struct vm_area_struct *vma) { return is_shared_maywrite(vma->vm_flags); } @@ -1855,7 +1855,7 @@ static inline struct folio *pfn_folio(unsigned long pfn) } #ifdef CONFIG_MMU -static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) +static inline pte_t mk_pte(const struct page *page, pgprot_t pgprot) { return pfn_pte(page_to_pfn(page), pgprot); } @@ -1870,7 +1870,7 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) * * Return: A page table entry suitable for mapping this folio. */ -static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot) +static inline pte_t folio_mk_pte(const struct folio *folio, pgprot_t pgprot) { return pfn_pte(folio_pfn(folio), pgprot); } @@ -1886,7 +1886,7 @@ static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot) * * Return: A page table entry suitable for mapping this folio. */ -static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot) +static inline pmd_t folio_mk_pmd(const struct folio *folio, pgprot_t pgprot) { return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot)); } @@ -1902,7 +1902,7 @@ static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot) * * Return: A page table entry suitable for mapping this folio. */ -static inline pud_t folio_mk_pud(struct folio *folio, pgprot_t pgprot) +static inline pud_t folio_mk_pud(const struct folio *folio, pgprot_t pgprot) { return pud_mkhuge(pfn_pud(folio_pfn(folio), pgprot)); } @@ -3520,7 +3520,7 @@ struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) return mtree_load(&mm->mm_mt, addr); } -static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) +static inline unsigned long stack_guard_start_gap(const struct vm_area_struct *vma) { if (vma->vm_flags & VM_GROWSDOWN) return stack_guard_gap; @@ -3532,7 +3532,7 @@ static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) return 0; } -static inline unsigned long vm_start_gap(struct vm_area_struct *vma) +static inline unsigned long vm_start_gap(const struct vm_area_struct *vma) { unsigned long gap = stack_guard_start_gap(vma); unsigned long vm_start = vma->vm_start; @@ -3543,7 +3543,7 @@ static inline unsigned long vm_start_gap(struct vm_area_struct *vma) return vm_start; } -static inline unsigned long vm_end_gap(struct vm_area_struct *vma) +static inline unsigned long vm_end_gap(const struct vm_area_struct *vma) { unsigned long vm_end = vma->vm_end; @@ -3555,7 +3555,7 @@ static inline unsigned long vm_end_gap(struct vm_area_struct *vma) return vm_end; } -static inline unsigned long vma_pages(struct vm_area_struct *vma) +static inline unsigned long vma_pages(const struct vm_area_struct *vma) { return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } @@ -3572,7 +3572,7 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, return vma; } -static inline bool range_in_vma(struct vm_area_struct *vma, +static inline bool range_in_vma(const struct vm_area_struct *vma, unsigned long start, unsigned long end) { return (vma && vma->vm_start <= start && end <= vma->vm_end); @@ -3688,7 +3688,7 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) * Indicates whether GUP can follow a PROT_NONE mapped page, or whether * a (NUMA hinting) fault is required. 
*/ -static inline bool gup_can_follow_protnone(struct vm_area_struct *vma, +static inline bool gup_can_follow_protnone(const struct vm_area_struct *vma, unsigned int flags) { /* @@ -3818,7 +3818,7 @@ static inline bool debug_guardpage_enabled(void) return static_branch_unlikely(&_debug_guardpage_enabled); } -static inline bool page_is_guard(struct page *page) +static inline bool page_is_guard(const struct page *page) { if (!debug_guardpage_enabled()) return false; @@ -3849,7 +3849,7 @@ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool debug_guardpage_enabled(void) { return false; } -static inline bool page_is_guard(struct page *page) { return false; } +static inline bool page_is_guard(const struct page *page) { return false; } static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, @@ -3931,7 +3931,7 @@ void vmemmap_free(unsigned long start, unsigned long end, #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP -static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ if (altmap) @@ -3945,7 +3945,7 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap, altmap->alloc -= nr_pfns; } #else -static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap) { return 0; } -- cgit v1.2.3 From a847b17009ec271514b269c90320a3893cd9b667 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:21 +0200 Subject: mm: constify highmem related functions for improved const-correctness Lots of functions in mm/highmem.c do not write to the given pointers and do not call functions that take non-const pointers and can therefore be constified. This includes functions like kunmap() which might be implemented in a way that writes to the pointer (e.g. to update reference counters or mapping fields), but currently are not. kmap() on the other hand cannot be made const because it calls set_page_address() which is non-const in some architectures/configurations. [akpm@linux-foundation.org: "fix" folio_page() build failure] Link: https://lkml.kernel.org/r/20250901205021.3573313-13-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. 
Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- arch/arm/include/asm/highmem.h | 6 +++--- arch/xtensa/include/asm/highmem.h | 2 +- include/linux/highmem-internal.h | 36 ++++++++++++++++++------------------ include/linux/highmem.h | 8 ++++---- include/linux/page-flags.h | 4 ++-- mm/highmem.c | 10 +++++----- 6 files changed, 33 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index b4b66220952d..bdb209e002a4 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -46,9 +46,9 @@ extern pte_t *pkmap_page_table; #endif #ifdef ARCH_NEEDS_KMAP_HIGH_GET -extern void *kmap_high_get(struct page *page); +extern void *kmap_high_get(const struct page *page); -static inline void *arch_kmap_local_high_get(struct page *page) +static inline void *arch_kmap_local_high_get(const struct page *page) { if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !cache_is_vivt()) return NULL; @@ -57,7 +57,7 @@ static inline void *arch_kmap_local_high_get(struct page *page) #define arch_kmap_local_high_get arch_kmap_local_high_get #else /* ARCH_NEEDS_KMAP_HIGH_GET */ -static inline void *kmap_high_get(struct page *page) +static inline void *kmap_high_get(const struct page *page) { return NULL; } diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 34b8b620e7f1..b55235f4adac 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -29,7 +29,7 @@ #if DCACHE_WAY_SIZE > PAGE_SIZE #define get_pkmap_color get_pkmap_color -static inline int get_pkmap_color(struct page *page) +static inline int get_pkmap_color(const struct page *page) { return DCACHE_ALIAS(page_to_phys(page)); } diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 36053c3d6d64..0574c21ca45d 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -7,7 +7,7 @@ */ #ifdef CONFIG_KMAP_LOCAL void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); -void *__kmap_local_page_prot(struct page *page, pgprot_t prot); +void *__kmap_local_page_prot(const struct page *page, pgprot_t prot); void kunmap_local_indexed(const void *vaddr); void kmap_local_fork(struct task_struct *tsk); void __kmap_local_sched_out(void); @@ -33,7 +33,7 @@ static inline void kmap_flush_tlb(unsigned long addr) { } #endif void *kmap_high(struct page *page); -void kunmap_high(struct page *page); +void kunmap_high(const struct page *page); void __kmap_flush_unused(void); struct page *__kmap_to_page(void *addr); @@ -50,7 +50,7 @@ static inline void *kmap(struct page *page) return addr; } -static inline void kunmap(struct page *page) +static inline void kunmap(const struct page *page) { might_sleep(); if (!PageHighMem(page)) @@ -68,12 +68,12 @@ static inline void kmap_flush_unused(void) __kmap_flush_unused(); } -static inline void *kmap_local_page(struct page *page) +static inline void *kmap_local_page(const struct page *page) { return __kmap_local_page_prot(page, kmap_prot); } -static inline void *kmap_local_page_try_from_panic(struct page *page) 
+static inline void *kmap_local_page_try_from_panic(const struct page *page) { if (!PageHighMem(page)) return page_address(page); @@ -81,13 +81,13 @@ static inline void *kmap_local_page_try_from_panic(struct page *page) return NULL; } -static inline void *kmap_local_folio(struct folio *folio, size_t offset) +static inline void *kmap_local_folio(const struct folio *folio, size_t offset) { - struct page *page = folio_page(folio, offset / PAGE_SIZE); + const struct page *page = folio_page(folio, offset / PAGE_SIZE); return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE; } -static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) +static inline void *kmap_local_page_prot(const struct page *page, pgprot_t prot) { return __kmap_local_page_prot(page, prot); } @@ -102,7 +102,7 @@ static inline void __kunmap_local(const void *vaddr) kunmap_local_indexed(vaddr); } -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +static inline void *kmap_atomic_prot(const struct page *page, pgprot_t prot) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); @@ -113,7 +113,7 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) return __kmap_local_page_prot(page, prot); } -static inline void *kmap_atomic(struct page *page) +static inline void *kmap_atomic(const struct page *page) { return kmap_atomic_prot(page, kmap_prot); } @@ -173,32 +173,32 @@ static inline void *kmap(struct page *page) return page_address(page); } -static inline void kunmap_high(struct page *page) { } +static inline void kunmap_high(const struct page *page) { } static inline void kmap_flush_unused(void) { } -static inline void kunmap(struct page *page) +static inline void kunmap(const struct page *page) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(page_address(page)); #endif } -static inline void *kmap_local_page(struct page *page) +static inline void *kmap_local_page(const struct page *page) { return page_address(page); } -static inline void *kmap_local_page_try_from_panic(struct page *page) +static inline void *kmap_local_page_try_from_panic(const struct page *page) { return page_address(page); } -static inline void *kmap_local_folio(struct folio *folio, size_t offset) +static inline void *kmap_local_folio(const struct folio *folio, size_t offset) { return folio_address(folio) + offset; } -static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) +static inline void *kmap_local_page_prot(const struct page *page, pgprot_t prot) { return kmap_local_page(page); } @@ -215,7 +215,7 @@ static inline void __kunmap_local(const void *addr) #endif } -static inline void *kmap_atomic(struct page *page) +static inline void *kmap_atomic(const struct page *page) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); @@ -225,7 +225,7 @@ static inline void *kmap_atomic(struct page *page) return page_address(page); } -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +static inline void *kmap_atomic_prot(const struct page *page, pgprot_t prot) { return kmap_atomic(page); } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 6234f316468c..105cc4c00cc3 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -43,7 +43,7 @@ static inline void *kmap(struct page *page); * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of * pages in the low memory area. 
*/ -static inline void kunmap(struct page *page); +static inline void kunmap(const struct page *page); /** * kmap_to_page - Get the page for a kmap'ed address @@ -93,7 +93,7 @@ static inline void kmap_flush_unused(void); * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_page() can rely on this side effect. */ -static inline void *kmap_local_page(struct page *page); +static inline void *kmap_local_page(const struct page *page); /** * kmap_local_folio - Map a page in this folio for temporary usage @@ -129,7 +129,7 @@ static inline void *kmap_local_page(struct page *page); * Context: Can be invoked from any context. * Return: The virtual address of @offset. */ -static inline void *kmap_local_folio(struct folio *folio, size_t offset); +static inline void *kmap_local_folio(const struct folio *folio, size_t offset); /** * kmap_atomic - Atomically map a page for temporary usage - Deprecated! @@ -176,7 +176,7 @@ static inline void *kmap_local_folio(struct folio *folio, size_t offset); * kunmap_atomic(vaddr2); * kunmap_atomic(vaddr1); */ -static inline void *kmap_atomic(struct page *page); +static inline void *kmap_atomic(const struct page *page); /* Highmem related interfaces for management code */ static inline unsigned long nr_free_highpages(void); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a88b61eec3f8..568011930e35 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -316,9 +316,9 @@ static __always_inline unsigned long _compound_head(const struct page *page) * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. */ -static inline struct page *folio_page(struct folio *folio, unsigned long n) +static inline struct page *folio_page(const struct folio *folio, unsigned long n) { - return &folio->page + n; + return (struct page *)(&folio->page + n); } static __always_inline int PageTail(const struct page *page) diff --git a/mm/highmem.c b/mm/highmem.c index ef3189b36cad..b5c8e4c2d5d4 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -61,7 +61,7 @@ static inline int kmap_local_calc_idx(int idx) /* * Determine color of virtual address where the page should be mapped. */ -static inline unsigned int get_pkmap_color(struct page *page) +static inline unsigned int get_pkmap_color(const struct page *page) { return 0; } @@ -334,7 +334,7 @@ EXPORT_SYMBOL(kmap_high); * * This can be called from any context. */ -void *kmap_high_get(struct page *page) +void *kmap_high_get(const struct page *page) { unsigned long vaddr, flags; @@ -356,7 +356,7 @@ void *kmap_high_get(struct page *page) * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called * only from user context. 
*/ -void kunmap_high(struct page *page) +void kunmap_high(const struct page *page) { unsigned long vaddr; unsigned long nr; @@ -508,7 +508,7 @@ static inline void kmap_local_idx_pop(void) #endif #ifndef arch_kmap_local_high_get -static inline void *arch_kmap_local_high_get(struct page *page) +static inline void *arch_kmap_local_high_get(const struct page *page) { return NULL; } @@ -572,7 +572,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) } EXPORT_SYMBOL_GPL(__kmap_local_pfn_prot); -void *__kmap_local_page_prot(struct page *page, pgprot_t prot) +void *__kmap_local_page_prot(const struct page *page, pgprot_t prot) { void *kmap; -- cgit v1.2.3 From 9fd53c8122271d9fe8b687f50a9bdf5588d41d0b Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Fri, 11 Jul 2025 13:55:09 +0800 Subject: mm/filemap: align last_index to folio size On XFS systems with pagesize=4K, blocksize=16K, and CONFIG_TRANSPARENT_HUGEPAGE enabled, We observed the following readahead behaviors: # echo 3 > /proc/sys/vm/drop_caches # dd if=test of=/dev/null bs=64k count=1 # ./tools/mm/page-types -r -L -f /mnt/xfs/test foffset offset flags 0 136d4c __RU_l_________H______t_________________F_1 1 136d4d __RU_l__________T_____t_________________F_1 2 136d4e __RU_l__________T_____t_________________F_1 3 136d4f __RU_l__________T_____t_________________F_1 ... c 136bb8 __RU_l_________H______t_________________F_1 d 136bb9 __RU_l__________T_____t_________________F_1 e 136bba __RU_l__________T_____t_________________F_1 f 136bbb __RU_l__________T_____t_________________F_1 <-- first read 10 13c2cc ___U_l_________H______t______________I__F_1 <-- readahead flag 11 13c2cd ___U_l__________T_____t______________I__F_1 12 13c2ce ___U_l__________T_____t______________I__F_1 13 13c2cf ___U_l__________T_____t______________I__F_1 ... 1c 1405d4 ___U_l_________H______t_________________F_1 1d 1405d5 ___U_l__________T_____t_________________F_1 1e 1405d6 ___U_l__________T_____t_________________F_1 1f 1405d7 ___U_l__________T_____t_________________F_1 [ra_size = 32, req_count = 16, async_size = 16] # echo 3 > /proc/sys/vm/drop_caches # dd if=test of=/dev/null bs=60k count=1 # ./page-types -r -L -f /mnt/xfs/test foffset offset flags 0 136048 __RU_l_________H______t_________________F_1 ... c 110a40 __RU_l_________H______t_________________F_1 d 110a41 __RU_l__________T_____t_________________F_1 e 110a42 __RU_l__________T_____t_________________F_1 <-- first read f 110a43 __RU_l__________T_____t_________________F_1 <-- first readahead flag 10 13e7a8 ___U_l_________H______t_________________F_1 ... 20 137a00 ___U_l_________H______t_______P______I__F_1 <-- second readahead flag (20 - 2f) 21 137a01 ___U_l__________T_____t_______P______I__F_1 ... 3f 10d4af ___U_l__________T_____t_______P_________F_1 [first readahead: ra_size = 32, req_count = 15, async_size = 17] When reading 64k data (same for 61-63k range, where last_index is page-aligned in filemap_get_pages()), 128k readahead is triggered via page_cache_sync_ra() and the PG_readahead flag is set on the next folio (the one containing 0x10 page). When reading 60k data, 128k readahead is also triggered via page_cache_sync_ra(). However, in this case the readahead flag is set on the 0xf page. Although the requested read size (req_count) is 60k, the actual read will be aligned to folio size (64k), which triggers the readahead flag and initiates asynchronous readahead via page_cache_async_ra(). This results in two readahead operations totaling 256k. 
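To make the alignment arithmetic concrete before digging into the root cause, here is a stand-alone userspace sketch (not kernel code; the 4k page size and the 16k minimum folio size are assumptions taken from the setup above) of the last_index computation before and after the change:

/*
 * Stand-alone sketch, not kernel code. Setup as above: 4k pages and a
 * 16k block size, so the minimum folio spans 4 pages (16k).
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/* round up to a power-of-two alignment, like the kernel's round_up() */
static unsigned long round_up_pow2(unsigned long x, unsigned long a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	unsigned long ki_pos = 0, count = 60 * 1024;	/* dd bs=60k */
	unsigned long min_nrbytes = 16 * 1024;		/* min folio size */

	/* Before: page-aligned, 15 pages requested but 16 actually read. */
	unsigned long before = (ki_pos + count + PAGE_SIZE - 1) / PAGE_SIZE;

	/* After: folio-aligned, requested size matches the actual read. */
	unsigned long after = round_up_pow2(ki_pos + count, min_nrbytes)
			      >> PAGE_SHIFT;

	printf("last_index before=%lu after=%lu\n", before, after);	/* 15 vs 16 */
	return 0;
}

With the page-aligned value the request ends one page short of the folio that will actually be read, which is what re-triggers readahead.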
The root cause is that when the requested size is smaller than the actual read size (due to folio alignment), it triggers asynchronous readahead. By changing last_index alignment from page size to folio size, we ensure the requested size matches the actual read size, preventing the case where a single read operation triggers two readahead operations. After applying the patch: # echo 3 > /proc/sys/vm/drop_caches # dd if=test of=/dev/null bs=60k count=1 # ./page-types -r -L -f /mnt/xfs/test foffset offset flags 0 136d4c __RU_l_________H______t_________________F_1 1 136d4d __RU_l__________T_____t_________________F_1 2 136d4e __RU_l__________T_____t_________________F_1 3 136d4f __RU_l__________T_____t_________________F_1 ... c 136bb8 __RU_l_________H______t_________________F_1 d 136bb9 __RU_l__________T_____t_________________F_1 e 136bba __RU_l__________T_____t_________________F_1 <-- first read f 136bbb __RU_l__________T_____t_________________F_1 10 13c2cc ___U_l_________H______t______________I__F_1 <-- readahead flag 11 13c2cd ___U_l__________T_____t______________I__F_1 12 13c2ce ___U_l__________T_____t______________I__F_1 13 13c2cf ___U_l__________T_____t______________I__F_1 ... 1c 1405d4 ___U_l_________H______t_________________F_1 1d 1405d5 ___U_l__________T_____t_________________F_1 1e 1405d6 ___U_l__________T_____t_________________F_1 1f 1405d7 ___U_l__________T_____t_________________F_1 [ra_size = 32, req_count = 16, async_size = 16] The same phenomenon will occur when reading from 49k to 64k. Set the readahead flag to the next folio. Because the minimum order of folio in address_space equals the block size (at least in xfs and bcachefs that already support bs > ps), having request_count aligned to block size will not cause overread. [klarasmodin@gmail.com: fix overflow on 32-bit] Link: https://lkml.kernel.org/r/yru7qf5gvyzccq5ohhpylvxug5lr5tf54omspbjh4sm6pcdb2r@fpjgj2pxw7va [akpm@linux-foundation.org: update it for Max's constification efforts] Link: https://lkml.kernel.org/r/20250711055509.91587-1-youling.tang@linux.dev Co-developed-by: Chi Zhiling Signed-off-by: Chi Zhiling Signed-off-by: Youling Tang Signed-off-by: Klara Modin Reviewed-by: Ryan Roberts Reviewed-by: Jan Kara Cc: Matthew Wilcox (Oracle) Cc: Youling Tang Cc: David Hildenbrand Cc: Klara Modin Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 6 ++++++ mm/filemap.c | 5 +++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index aec4a11565bc..185644e288ea 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -482,6 +482,12 @@ mapping_min_folio_nrpages(const struct address_space *mapping) return 1UL << mapping_min_folio_order(mapping); } +static inline unsigned long +mapping_min_folio_nrbytes(const struct address_space *mapping) +{ + return mapping_min_folio_nrpages(mapping) << PAGE_SHIFT; +} + /** * mapping_align_index() - Align index for this mapping. * @mapping: The address_space. 
diff --git a/mm/filemap.c b/mm/filemap.c
index cd9387b0a5b5..344ab106c21c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2600,8 +2600,9 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count,
 	unsigned int flags;
 	int err = 0;

-	/* "last_index" is the index of the page beyond the end of the read */
-	last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
+	/* "last_index" is the index of the folio beyond the end of the read */
+	last_index = round_up(iocb->ki_pos + count,
+			      mapping_min_folio_nrbytes(mapping)) >> PAGE_SHIFT;
 retry:
 	if (fatal_signal_pending(current))
 		return -EINTR;
-- cgit v1.2.3

From 94326d3130b5e78a35265bbf7822148372b39231 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Wed, 3 Sep 2025 20:10:39 +0100
Subject: mm: remove mlock_count from struct page

All users now use folio->mlock_count so we can remove this element of
struct page. Move the useful comments over to struct folio.

Link: https://lkml.kernel.org/r/20250903191041.1630338-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle)
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
---
 include/linux/mm_types.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 275e8060d918..ff2b4e13215f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -94,14 +94,6 @@ struct page {
 		union {
 			struct list_head lru;

-			/* Or, for the Unevictable "LRU list" slot */
-			struct {
-				/* Always even, to negate PageTail */
-				void *__filler;
-				/* Count page's or folio's mlocks */
-				unsigned int mlock_count;
-			};
-
 			/* Or, free page */
 			struct list_head buddy_list;
 			struct list_head pcp_list;
@@ -391,7 +383,9 @@ struct folio {
 			union {
 				struct list_head lru;
 	/* private: avoid cluttering the output */
+				/* For the Unevictable "LRU list" slot */
 				struct {
+					/* Avoid compound_head */
 					void *__filler;
 	/* public: */
 					unsigned int mlock_count;
-- cgit v1.2.3

From da939ef4c494246bc2102ecb628bbcc71d650410 Mon Sep 17 00:00:00 2001
From: Alice Ryhl
Date: Tue, 2 Sep 2025 08:35:11 +0000
Subject: rust: maple_tree: add MapleTree
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Add Rust abstraction for Maple Trees", v3.

This will be used in the Tyr driver [1] to allocate from the GPU's VA
space that is not owned by userspace, but by the kernel, for kernel GPU
mappings.

Danilo tells me that in nouveau, the maple tree is used for keeping track
of "VM regions" on top of GPUVM, and that he will most likely end up doing
the same in the Rust Nova driver as well.

These abstractions intentionally do not expose any way to make use of
external locking. You are required to use the internal spinlock. For now,
we do not support loads that only utilize rcu for protection.

This contains some parts taken from Andrew Ballance's RFC [2] from April.
However, it has also been reworked significantly compared to that RFC,
taking the use-cases in Tyr into account.

This patch (of 3):

The maple tree will be used in the Tyr driver to allocate and keep track
of GPU allocations created internally (i.e. not by userspace). It will
likely also be used in the Nova driver eventually.

This adds the simplest methods for addition and removal that do not
require any special care with respect to concurrency.

This implementation is based on the RFC by Andrew but with significant
changes to simplify the implementation.
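For orientation, the C maple tree API that these wrappers sit on can be sketched as follows (illustrative only; error handling trimmed to the essentials):

#include <linux/maple_tree.h>
#include <linux/slab.h>

static int maple_example(void)
{
	DEFINE_MTREE(mt);	/* same as mt_init_flags(&mt, 0) */
	int *ten, *erased;
	int err;

	ten = kmalloc(sizeof(*ten), GFP_KERNEL);
	if (!ten)
		return -ENOMEM;
	*ten = 10;

	/*
	 * Store one entry covering indices 100..499. An overlapping store
	 * fails with -EEXIST, which the Rust side reports as
	 * InsertErrorKind::Occupied.
	 */
	err = mtree_insert_range(&mt, 100, 499, ten, GFP_KERNEL);
	if (err) {
		kfree(ten);
		return err;
	}

	/* Any index inside the range erases (and returns) the whole entry. */
	erased = mtree_erase(&mt, 275);
	kfree(erased);

	mtree_destroy(&mt);
	return 0;
}

The Rust insert_range() below maps the -EEXIST and -ENOMEM returns seen here onto InsertErrorKind::Occupied and InsertErrorKind::AllocError while handing the value back to the caller.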
[ojeda@kernel.org: fix intra-doc links]
Link: https://lkml.kernel.org/r/20250910140212.997771-1-ojeda@kernel.org
Link: https://lkml.kernel.org/r/20250902-maple-tree-v3-0-fb5c8958fb1e@google.com
Link: https://lkml.kernel.org/r/20250902-maple-tree-v3-1-fb5c8958fb1e@google.com
Link: https://lore.kernel.org/r/20250627-tyr-v1-1-cb5f4c6ced46@collabora.com [1]
Link: https://lore.kernel.org/r/20250405060154.1550858-1-andrewjballance@gmail.com [2]
Co-developed-by: Andrew Ballance
Signed-off-by: Andrew Ballance
Signed-off-by: Alice Ryhl
Reviewed-by: Danilo Krummrich
Cc: Andreas Hindborg
Cc: Björn Roy Baron
Cc: Boqun Feng
Cc: Daniel Almeida
Cc: Gary Guo
Cc: Liam Howlett
Cc: Lorenzo Stoakes
Cc: Miguel Ojeda
Cc: Trevor Gross
Signed-off-by: Andrew Morton
---
 MAINTAINERS                |   4 +
 include/linux/maple_tree.h |   3 +
 rust/helpers/helpers.c     |   1 +
 rust/helpers/maple_tree.c  |   8 ++
 rust/kernel/lib.rs         |   1 +
 rust/kernel/maple_tree.rs  | 349 +++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 366 insertions(+)
 create mode 100644 rust/helpers/maple_tree.c
 create mode 100644 rust/kernel/maple_tree.rs
(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index a7e123ddf05a..68d29f0220fc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14672,6 +14672,8 @@ F: net/mctp/
 MAPLE TREE
 M: Liam R. Howlett
+R: Alice Ryhl
+R: Andrew Ballance
 L: maple-tree@lists.infradead.org
 L: linux-mm@kvack.org
 S: Supported
@@ -14680,6 +14682,8 @@ F: include/linux/maple_tree.h
 F: include/trace/events/maple_tree.h
 F: lib/maple_tree.c
 F: lib/test_maple_tree.c
+F: rust/helpers/maple_tree.c
+F: rust/kernel/maple_tree.rs
 F: tools/testing/radix-tree/maple.c
 F: tools/testing/shared/linux/maple_tree.h
diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index 41e633264e51..05730171d201 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -481,6 +481,9 @@ struct ma_wr_state {
 #define MA_ERROR(err) \
 		((struct maple_enode *)(((unsigned long)err << 2) | 2UL))

+/*
+ * When changing MA_STATE, remember to also change rust/kernel/maple_tree.rs
+ */
 #define MA_STATE(name, mt, first, end)					\
 	struct ma_state name = {					\
 		.tree = mt,						\
diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c
index 7cf7fe95e41d..c5d42e0f7ce6 100644
--- a/rust/helpers/helpers.c
+++ b/rust/helpers/helpers.c
@@ -26,6 +26,7 @@
 #include "io.c"
 #include "jump_label.c"
 #include "kunit.c"
+#include "maple_tree.c"
 #include "mm.c"
 #include "mutex.c"
 #include "of.c"
diff --git a/rust/helpers/maple_tree.c b/rust/helpers/maple_tree.c
new file mode 100644
index 000000000000..1dd9ac84a13f
--- /dev/null
+++ b/rust/helpers/maple_tree.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/maple_tree.h>
+
+void rust_helper_mt_init_flags(struct maple_tree *mt, unsigned int flags)
+{
+	mt_init_flags(mt, flags);
+}
diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs
index ed53169e795c..6b0a5689669f 100644
--- a/rust/kernel/lib.rs
+++ b/rust/kernel/lib.rs
@@ -96,6 +96,7 @@ pub mod jump_label;
 #[cfg(CONFIG_KUNIT)]
 pub mod kunit;
 pub mod list;
+pub mod maple_tree;
 pub mod miscdevice;
 pub mod mm;
 #[cfg(CONFIG_NET)]
diff --git a/rust/kernel/maple_tree.rs b/rust/kernel/maple_tree.rs
new file mode 100644
index 000000000000..319772878b89
--- /dev/null
+++ b/rust/kernel/maple_tree.rs
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Maple trees.
+//!
+//! C header: [`include/linux/maple_tree.h`](srctree/include/linux/maple_tree.h)
+//!
+//! Reference: <https://docs.kernel.org/core-api/maple_tree.html>
+
+use core::{
+    marker::PhantomData,
+    ops::{Bound, RangeBounds},
+    ptr,
+};
+
+use kernel::{
+    alloc::Flags,
+    error::to_result,
+    prelude::*,
+    types::{ForeignOwnable, Opaque},
+};
+
+/// A maple tree optimized for storing non-overlapping ranges.
+///
+/// # Invariants
+///
+/// Each range in the maple tree owns an instance of `T`.
+#[pin_data(PinnedDrop)]
+#[repr(transparent)]
+pub struct MapleTree<T: ForeignOwnable> {
+    #[pin]
+    tree: Opaque<bindings::maple_tree>,
+    _p: PhantomData<T>,
+}
+
+#[inline]
+fn to_maple_range(range: impl RangeBounds<usize>) -> Option<(usize, usize)> {
+    let first = match range.start_bound() {
+        Bound::Included(start) => *start,
+        Bound::Excluded(start) => start.checked_add(1)?,
+        Bound::Unbounded => 0,
+    };
+
+    let last = match range.end_bound() {
+        Bound::Included(end) => *end,
+        Bound::Excluded(end) => end.checked_sub(1)?,
+        Bound::Unbounded => usize::MAX,
+    };
+
+    if last < first {
+        return None;
+    }
+
+    Some((first, last))
+}
+
+impl<T: ForeignOwnable> MapleTree<T> {
+    /// Create a new maple tree.
+    ///
+    /// The tree will use the regular implementation with a higher branching factor, rather than
+    /// the allocation tree.
+    #[inline]
+    pub fn new() -> impl PinInit<Self> {
+        pin_init!(MapleTree {
+            // SAFETY: This initializes a maple tree into a pinned slot. The maple tree will be
+            // destroyed in Drop before the memory location becomes invalid.
+            tree <- Opaque::ffi_init(|slot| unsafe { bindings::mt_init_flags(slot, 0) }),
+            _p: PhantomData,
+        })
+    }
+
+    /// Insert the value at the given index.
+    ///
+    /// # Errors
+    ///
+    /// If the maple tree already contains a range using the given index, then this call will
+    /// return an [`InsertErrorKind::Occupied`]. It may also fail if memory allocation fails.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use kernel::maple_tree::{InsertErrorKind, MapleTree};
+    ///
+    /// let tree = KBox::pin_init(MapleTree::<KBox<i32>>::new(), GFP_KERNEL)?;
+    ///
+    /// let ten = KBox::new(10, GFP_KERNEL)?;
+    /// let twenty = KBox::new(20, GFP_KERNEL)?;
+    /// let the_answer = KBox::new(42, GFP_KERNEL)?;
+    ///
+    /// // These calls will succeed.
+    /// tree.insert(100, ten, GFP_KERNEL)?;
+    /// tree.insert(101, twenty, GFP_KERNEL)?;
+    ///
+    /// // This will fail because the index is already in use.
+    /// assert_eq!(
+    ///     tree.insert(100, the_answer, GFP_KERNEL).unwrap_err().cause,
+    ///     InsertErrorKind::Occupied,
+    /// );
+    /// # Ok::<_, Error>(())
+    /// ```
+    #[inline]
+    pub fn insert(&self, index: usize, value: T, gfp: Flags) -> Result<(), InsertError<T>> {
+        self.insert_range(index..=index, value, gfp)
+    }
+
+    /// Insert a value to the specified range, failing on overlap.
+    ///
+    /// This accepts the usual types of Rust ranges using the `..` and `..=` syntax for exclusive
+    /// and inclusive ranges respectively. The range must not be empty, and must not overlap with
+    /// any existing range.
+    ///
+    /// # Errors
+    ///
+    /// If the maple tree already contains an overlapping range, then this call will return an
+    /// [`InsertErrorKind::Occupied`]. It may also fail if memory allocation fails or if the
+    /// requested range is invalid (e.g. empty).
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use kernel::maple_tree::{InsertErrorKind, MapleTree};
+    ///
+    /// let tree = KBox::pin_init(MapleTree::<KBox<i32>>::new(), GFP_KERNEL)?;
+    ///
+    /// let ten = KBox::new(10, GFP_KERNEL)?;
+    /// let twenty = KBox::new(20, GFP_KERNEL)?;
+    /// let the_answer = KBox::new(42, GFP_KERNEL)?;
+    /// let hundred = KBox::new(100, GFP_KERNEL)?;
+    ///
+    /// // Insert the value 10 at the indices 100 to 499.
+    /// tree.insert_range(100..500, ten, GFP_KERNEL)?;
+    ///
+    /// // Insert the value 20 at the indices 500 to 1000.
+    /// tree.insert_range(500..=1000, twenty, GFP_KERNEL)?;
+    ///
+    /// // This will fail due to overlap with the previous range on index 1000.
+    /// assert_eq!(
+    ///     tree.insert_range(1000..1200, the_answer, GFP_KERNEL).unwrap_err().cause,
+    ///     InsertErrorKind::Occupied,
+    /// );
+    ///
+    /// // When using .. to specify the range, you must be careful to ensure that the range is
+    /// // non-empty.
+    /// assert_eq!(
+    ///     tree.insert_range(72..72, hundred, GFP_KERNEL).unwrap_err().cause,
+    ///     InsertErrorKind::InvalidRequest,
+    /// );
+    /// # Ok::<_, Error>(())
+    /// ```
+    pub fn insert_range<R>(&self, range: R, value: T, gfp: Flags) -> Result<(), InsertError<T>>
+    where
+        R: RangeBounds<usize>,
+    {
+        let Some((first, last)) = to_maple_range(range) else {
+            return Err(InsertError {
+                value,
+                cause: InsertErrorKind::InvalidRequest,
+            });
+        };
+
+        let ptr = T::into_foreign(value);
+
+        // SAFETY: The tree is valid, and we are passing a pointer to an owned instance of `T`.
+        let res = to_result(unsafe {
+            bindings::mtree_insert_range(self.tree.get(), first, last, ptr, gfp.as_raw())
+        });
+
+        if let Err(err) = res {
+            // SAFETY: As `mtree_insert_range` failed, it is safe to take back ownership.
+            let value = unsafe { T::from_foreign(ptr) };
+
+            let cause = if err == ENOMEM {
+                InsertErrorKind::AllocError(kernel::alloc::AllocError)
+            } else if err == EEXIST {
+                InsertErrorKind::Occupied
+            } else {
+                InsertErrorKind::InvalidRequest
+            };
+            Err(InsertError { value, cause })
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Erase the range containing the given index.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use kernel::maple_tree::MapleTree;
+    ///
+    /// let tree = KBox::pin_init(MapleTree::<KBox<i32>>::new(), GFP_KERNEL)?;
+    ///
+    /// let ten = KBox::new(10, GFP_KERNEL)?;
+    /// let twenty = KBox::new(20, GFP_KERNEL)?;
+    ///
+    /// tree.insert_range(100..500, ten, GFP_KERNEL)?;
+    /// tree.insert(67, twenty, GFP_KERNEL)?;
+    ///
+    /// assert_eq!(tree.erase(67).map(|v| *v), Some(20));
+    /// assert_eq!(tree.erase(275).map(|v| *v), Some(10));
+    ///
+    /// // The previous call erased the entire range, not just index 275.
+    /// assert!(tree.erase(127).is_none());
+    /// # Ok::<_, Error>(())
+    /// ```
+    #[inline]
+    pub fn erase(&self, index: usize) -> Option<T> {
+        // SAFETY: `self.tree` contains a valid maple tree.
+        let ret = unsafe { bindings::mtree_erase(self.tree.get(), index) };
+
+        // SAFETY: If the pointer is not null, then we took ownership of a valid instance of `T`
+        // from the tree.
+        unsafe { T::try_from_foreign(ret) }
+    }
+
+    /// Free all `T` instances in this tree.
+    ///
+    /// # Safety
+    ///
+    /// This frees Rust data referenced by the maple tree without removing it from the maple tree,
+    /// leaving it in an invalid state. The caller must ensure that this invalid state cannot be
+    /// observed by the end-user.
+    unsafe fn free_all_entries(self: Pin<&mut Self>) {
+        // SAFETY: The caller provides exclusive access to the entire maple tree, so we have
+        // exclusive access to the entire maple tree despite not holding the lock.
+        let mut ma_state = unsafe { MaState::new_raw(self.into_ref().get_ref(), 0, usize::MAX) };
+
+        loop {
+            // This uses the raw accessor because we're destroying pointers without removing them
+            // from the maple tree, which is only valid because this is the destructor.
+            let ptr = ma_state.mas_find_raw(usize::MAX);
+            if ptr.is_null() {
+                break;
+            }
+            // SAFETY: By the type invariants, this pointer references a valid value of type `T`.
+            // By the safety requirements, it is okay to free it without removing it from the maple
+            // tree.
+            drop(unsafe { T::from_foreign(ptr) });
+        }
+    }
+}
+
+#[pinned_drop]
+impl<T: ForeignOwnable> PinnedDrop for MapleTree<T> {
+    #[inline]
+    fn drop(mut self: Pin<&mut Self>) {
+        // We only iterate the tree if the Rust value has a destructor.
+        if core::mem::needs_drop::<T>() {
+            // SAFETY: Other than the below `mtree_destroy` call, the tree will not be accessed
+            // after this call.
+            unsafe { self.as_mut().free_all_entries() };
+        }
+
+        // SAFETY: The tree is valid, and will not be accessed after this call.
+        unsafe { bindings::mtree_destroy(self.tree.get()) };
+    }
+}
+
+/// A helper type used for navigating a [`MapleTree`].
+///
+/// # Invariants
+///
+/// For the duration of `'tree`:
+///
+/// * The `ma_state` references a valid `MapleTree`.
+/// * The `ma_state` has read/write access to the tree.
+pub struct MaState<'tree, T: ForeignOwnable> {
+    state: bindings::ma_state,
+    _phantom: PhantomData<&'tree mut MapleTree<T>>,
+}
+
+impl<'tree, T: ForeignOwnable> MaState<'tree, T> {
+    /// Initialize a new `MaState` with the given tree.
+    ///
+    /// # Safety
+    ///
+    /// The caller must ensure that this `MaState` has read/write access to the maple tree.
+    #[inline]
+    unsafe fn new_raw(mt: &'tree MapleTree<T>, first: usize, end: usize) -> Self {
+        // INVARIANT:
+        // * Having a reference ensures that the `MapleTree` is valid for `'tree`.
+        // * The caller ensures that we have read/write access.
+        Self {
+            state: bindings::ma_state {
+                tree: mt.tree.get(),
+                index: first,
+                last: end,
+                node: ptr::null_mut(),
+                status: bindings::maple_status_ma_start,
+                min: 0,
+                max: usize::MAX,
+                alloc: ptr::null_mut(),
+                mas_flags: 0,
+                store_type: bindings::store_type_wr_invalid,
+                ..Default::default()
+            },
+            _phantom: PhantomData,
+        }
+    }
+
+    #[inline]
+    fn as_raw(&mut self) -> *mut bindings::ma_state {
+        &raw mut self.state
+    }
+
+    #[inline]
+    fn mas_find_raw(&mut self, max: usize) -> *mut c_void {
+        // SAFETY: By the type invariants, the `ma_state` is active and we have read/write access
+        // to the tree.
+        unsafe { bindings::mas_find(self.as_raw(), max) }
+    }
+}
+
+/// Error type for failure to insert a new value.
+pub struct InsertError<T> {
+    /// The value that could not be inserted.
+    pub value: T,
+    /// The reason for the failure to insert.
+    pub cause: InsertErrorKind,
+}
+
+/// The reason for the failure to insert.
+#[derive(PartialEq, Eq, Copy, Clone, Debug)]
+pub enum InsertErrorKind {
+    /// There is already a value in the requested range.
+    Occupied,
+    /// Failure to allocate memory.
+    AllocError(kernel::alloc::AllocError),
+    /// The insertion request was invalid.
+    InvalidRequest,
+}
+
+impl From<InsertErrorKind> for Error {
+    #[inline]
+    fn from(kind: InsertErrorKind) -> Error {
+        match kind {
+            InsertErrorKind::Occupied => EEXIST,
+            InsertErrorKind::AllocError(kernel::alloc::AllocError) => ENOMEM,
+            InsertErrorKind::InvalidRequest => EINVAL,
+        }
+    }
+}
+
+impl<T> From<InsertError<T>> for Error {
+    #[inline]
+    fn from(insert_err: InsertError<T>) -> Error {
+        Error::from(insert_err.cause)
+    }
+}
-- cgit v1.2.3

From a488ba3124c82d704963fcd760fe653df1987b13 Mon Sep 17 00:00:00 2001
From: Pankaj Raghav
Date: Fri, 5 Sep 2025 17:00:12 +0200
Subject: huge_memory: return -EINVAL in folio split functions when THP is disabled

split_huge_page_to_list_[to_order](), split_huge_page() and
try_folio_split() return 0 on success and error codes on failure. When
THP is disabled, these functions return 0 indicating success even though
an error code should be returned as it is not possible to split a folio
when THP is disabled.

Make all these functions return -EINVAL to indicate failure instead of 0.
As large folios depend on CONFIG_THP, issue a warning, as these functions
should not be called without a large folio.

Link: https://lkml.kernel.org/r/20250905150012.93714-1-kernel@pankajraghav.com
Signed-off-by: Pankaj Raghav
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202509051753.riCeG7LC-lkp@intel.com/
Acked-by: David Hildenbrand
Acked-by: Zi Yan
Acked-by: Kiryl Shutsemau
Reviewed-by: Lorenzo Stoakes
Reviewed-by: Barry Song
Reviewed-by: Anshuman Khandual
Signed-off-by: Andrew Morton
---
 include/linux/huge_mm.h | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 29ef70022da1..f327d62fc985 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -588,22 +588,26 @@ static inline int split_huge_page_to_list_to_order(struct page *page,
 		struct list_head *list, unsigned int new_order)
 {
-	return 0;
+	VM_WARN_ON_ONCE_PAGE(1, page);
+	return -EINVAL;
 }

 static inline int split_huge_page(struct page *page)
 {
-	return 0;
+	VM_WARN_ON_ONCE_PAGE(1, page);
+	return -EINVAL;
 }

 static inline int split_folio_to_list(struct folio *folio, struct list_head *list)
 {
-	return 0;
+	VM_WARN_ON_ONCE_FOLIO(1, folio);
+	return -EINVAL;
 }

 static inline int try_folio_split(struct folio *folio, struct page *page,
 		struct list_head *list)
 {
-	return 0;
+	VM_WARN_ON_ONCE_FOLIO(1, folio);
+	return -EINVAL;
 }

 static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
-- cgit v1.2.3

From 614d850efda98e4455e4f2b55e64864f68a4e370 Mon Sep 17 00:00:00 2001
From: Alistair Popple
Date: Thu, 4 Sep 2025 08:59:26 +1000
Subject: mm/memremap: remove unused get_dev_pagemap() parameter

GUP no longer uses get_dev_pagemap(). As it was the only user of the
get_dev_pagemap() pgmap caching feature it can be removed.
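To illustrate the simplified calling convention, a minimal hypothetical caller now follows the plain get/put pattern (sketch only; pfn_is_device_memory() is not an existing kernel helper):

#include <linux/memremap.h>

/* Hypothetical helper, for illustration only. */
static bool pfn_is_device_memory(unsigned long pfn)
{
	struct dev_pagemap *pgmap;

	/* Takes a reference on success; no pgmap caching argument anymore. */
	pgmap = get_dev_pagemap(pfn);
	if (!pgmap)
		return false;

	put_dev_pagemap(pgmap);
	return true;
}

The reference taken by get_dev_pagemap() must still be dropped with put_dev_pagemap(); only the caching of a previously held pgmap is gone.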
Link: https://lkml.kernel.org/r/20250903225926.34702-2-apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Cc: John Hubbard Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/memremap.h | 6 ++---- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/memremap.c | 22 ++++------------------ 4 files changed, 8 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index aa1b6aa877a0..e5951ba12a28 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -211,8 +211,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); -struct dev_pagemap *get_dev_pagemap(unsigned long pfn, - struct dev_pagemap *pgmap); +struct dev_pagemap *get_dev_pagemap(unsigned long pfn); bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long memremap_compat_align(void); @@ -234,8 +233,7 @@ static inline void devm_memunmap_pages(struct device *dev, { } -static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, - struct dev_pagemap *pgmap) +static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn) { return NULL; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2a95b41e0535..6d9134e3d115 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2194,7 +2194,7 @@ int memory_failure(unsigned long pfn, int flags) goto unlock_mutex; if (pfn_valid(pfn)) { - pgmap = get_dev_pagemap(pfn, NULL); + pgmap = get_dev_pagemap(pfn); put_ref_page(pfn, flags); if (pgmap) { res = memory_failure_dev_pagemap(pfn, flags, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 74318c787715..883b8e4d51ba 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -375,7 +375,7 @@ struct page *pfn_to_online_page(unsigned long pfn) * the section may be 'offline' but 'valid'. Only * get_dev_pagemap() can determine sub-section online status. */ - pgmap = get_dev_pagemap(pfn, NULL); + pgmap = get_dev_pagemap(pfn); put_dev_pagemap(pgmap); /* The presence of a pgmap indicates ZONE_DEVICE offline pfn */ diff --git a/mm/memremap.c b/mm/memremap.c index a2d4bb88f64b..46cb1b0b6f72 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -153,14 +153,14 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, "altmap not supported for multiple ranges\n")) return -EINVAL; - conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL); + conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start)); if (conflict_pgmap) { WARN(1, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); return -ENOMEM; } - conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL); + conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end)); if (conflict_pgmap) { WARN(1, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); @@ -397,26 +397,12 @@ EXPORT_SYMBOL_GPL(devm_memunmap_pages); /** * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn * @pfn: page frame number to lookup page_map - * @pgmap: optional known pgmap that already has a reference - * - * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap - * is non-NULL but does not cover @pfn the reference to it will be released. 
*/ -struct dev_pagemap *get_dev_pagemap(unsigned long pfn, - struct dev_pagemap *pgmap) +struct dev_pagemap *get_dev_pagemap(unsigned long pfn) { + struct dev_pagemap *pgmap; resource_size_t phys = PFN_PHYS(pfn); - /* - * In the cached case we're already holding a live reference. - */ - if (pgmap) { - if (phys >= pgmap->range.start && phys <= pgmap->range.end) - return pgmap; - put_dev_pagemap(pgmap); - } - - /* fall back to slow path lookup */ rcu_read_lock(); pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); if (pgmap && !percpu_ref_tryget_live_rcu(&pgmap->ref)) -- cgit v1.2.3 From 4522aed4fffbbd18ab3581d733d0572d45780d07 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:51 +0800 Subject: mm, swap: rename and move some swap cluster definition and helpers No feature change, move cluster related definitions and helpers to mm/swap.h, also tidy up and add a "swap_" prefix for cluster lock/unlock helpers, so they can be used outside of swap files. And while at it, add kerneldoc. Link: https://lkml.kernel.org/r/20250916160100.31545-7-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Reviewed-by: Barry Song Acked-by: Chris Li Acked-by: David Hildenbrand Suggested-by: Chris Li Acked-by: Nhat Pham Cc: Baoquan He Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/swap.h | 34 ------------------ mm/swap.h | 70 +++++++++++++++++++++++++++++++++++++ mm/swapfile.c | 97 ++++++++++++++++------------------------------------ 3 files changed, 99 insertions(+), 102 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index a2bb20841616..78cc48a65512 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -235,40 +235,6 @@ enum { /* Special value in each swap_map continuation */ #define SWAP_CONT_MAX 0x7f /* Max count */ -/* - * We use this to track usage of a cluster. A cluster is a block of swap disk - * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All - * free clusters are organized into a list. We fetch an entry from the list to - * get a free cluster. - * - * The flags field determines if a cluster is free. This is - * protected by cluster lock. - */ -struct swap_cluster_info { - spinlock_t lock; /* - * Protect swap_cluster_info fields - * other than list, and swap_info_struct->swap_map - * elements corresponding to the swap cluster. - */ - u16 count; - u8 flags; - u8 order; - struct list_head list; -}; - -/* All on-list cluster must have a non-zero flag. */ -enum swap_cluster_flags { - CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ - CLUSTER_FLAG_FREE, - CLUSTER_FLAG_NONFULL, - CLUSTER_FLAG_FRAG, - /* Clusters with flags above are allocatable */ - CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, - CLUSTER_FLAG_FULL, - CLUSTER_FLAG_DISCARD, - CLUSTER_FLAG_MAX, -}; - /* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. 
This also prevents the

diff --git a/mm/swap.h b/mm/swap.h
index 7d868f8de696..138b5197c35e 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -7,10 +7,80 @@ struct swap_iocb;

 extern int page_cluster;

+#ifdef CONFIG_THP_SWAP
+#define SWAPFILE_CLUSTER	HPAGE_PMD_NR
+#define swap_entry_order(order)	(order)
+#else
+#define SWAPFILE_CLUSTER	256
+#define swap_entry_order(order)	0
+#endif
+
+/*
+ * We use this to track usage of a cluster. A cluster is a block of swap disk
+ * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All
+ * free clusters are organized into a list. We fetch an entry from the list to
+ * get a free cluster.
+ *
+ * The flags field determines if a cluster is free. This is
+ * protected by cluster lock.
+ */
+struct swap_cluster_info {
+	spinlock_t lock;	/*
+				 * Protect swap_cluster_info fields
+				 * other than list, and swap_info_struct->swap_map
+				 * elements corresponding to the swap cluster.
+				 */
+	u16 count;
+	u8 flags;
+	u8 order;
+	struct list_head list;
+};
+
+/* All on-list cluster must have a non-zero flag. */
+enum swap_cluster_flags {
+	CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */
+	CLUSTER_FLAG_FREE,
+	CLUSTER_FLAG_NONFULL,
+	CLUSTER_FLAG_FRAG,
+	/* Clusters with flags above are allocatable */
+	CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG,
+	CLUSTER_FLAG_FULL,
+	CLUSTER_FLAG_DISCARD,
+	CLUSTER_FLAG_MAX,
+};
+
 #ifdef CONFIG_SWAP
 #include <linux/swapops.h> /* for swp_offset */
 #include <linux/blk_types.h> /* for bio_end_io_t */

+static inline struct swap_cluster_info *swp_offset_cluster(
+		struct swap_info_struct *si, pgoff_t offset)
+{
+	return &si->cluster_info[offset / SWAPFILE_CLUSTER];
+}
+
+/**
+ * swap_cluster_lock - Lock and return the swap cluster of given offset.
+ * @si: swap device the cluster belongs to.
+ * @offset: the swap entry offset, pointing to a valid slot.
+ *
+ * Context: The caller must ensure the offset is in the valid range and
+ * protect the swap device with reference count or locks.
+ */
+static inline struct swap_cluster_info *swap_cluster_lock(
+		struct swap_info_struct *si, unsigned long offset)
+{
+	struct swap_cluster_info *ci = swp_offset_cluster(si, offset);
+
+	spin_lock(&ci->lock);
+	return ci;
+}
+
+static inline void swap_cluster_unlock(struct swap_cluster_info *ci)
+{
+	spin_unlock(&ci->lock);
+}
+
 /* linux/mm/page_io.c */
 int sio_pool_init(void);
 struct swap_iocb;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c3c3364cb42e..700e07cb1cbd 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -58,9 +58,6 @@ static void swap_entries_free(struct swap_info_struct *si,
 static void swap_range_alloc(struct swap_info_struct *si,
 			     unsigned int nr_entries);
 static bool folio_swapcache_freeable(struct folio *folio);
-static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
-					      unsigned long offset);
-static inline void unlock_cluster(struct swap_cluster_info *ci);

 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
@@ -258,9 +255,9 @@ again:
 	 * swap_map is HAS_CACHE only, which means the slots have no page table
 	 * reference or pending writeback, and can't be allocated to others.
*/ - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); need_reclaim = swap_only_has_cache(si, offset, nr_pages); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (!need_reclaim) goto out_unlock; @@ -385,19 +382,6 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } -#ifdef CONFIG_THP_SWAP -#define SWAPFILE_CLUSTER HPAGE_PMD_NR - -#define swap_entry_order(order) (order) -#else -#define SWAPFILE_CLUSTER 256 - -/* - * Define swap_entry_order() as constant to let compiler to optimize - * out some code if !CONFIG_THP_SWAP - */ -#define swap_entry_order(order) 0 -#endif #define LATENCY_LIMIT 256 static inline bool cluster_is_empty(struct swap_cluster_info *info) @@ -425,34 +409,12 @@ static inline unsigned int cluster_index(struct swap_info_struct *si, return ci - si->cluster_info; } -static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si, - unsigned long offset) -{ - return &si->cluster_info[offset / SWAPFILE_CLUSTER]; -} - static inline unsigned int cluster_offset(struct swap_info_struct *si, struct swap_cluster_info *ci) { return cluster_index(si, ci) * SWAPFILE_CLUSTER; } -static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, - unsigned long offset) -{ - struct swap_cluster_info *ci; - - ci = offset_to_cluster(si, offset); - spin_lock(&ci->lock); - - return ci; -} - -static inline void unlock_cluster(struct swap_cluster_info *ci) -{ - spin_unlock(&ci->lock); -} - static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, enum swap_cluster_flags new_flags) @@ -808,7 +770,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, } out: relocate_cluster(si, ci); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (si->flags & SWP_SOLIDSTATE) { this_cpu_write(percpu_swap_cluster.offset[order], next); this_cpu_write(percpu_swap_cluster.si[order], si); @@ -875,7 +837,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) if (ci->flags == CLUSTER_FLAG_NONE) relocate_cluster(si, ci); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (to_scan <= 0) break; } @@ -914,7 +876,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o if (offset == SWAP_ENTRY_INVALID) goto new_cluster; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); /* Cluster could have been used by another order */ if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) @@ -922,7 +884,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o found = alloc_swap_scan_cluster(si, ci, offset, order, usage); } else { - unlock_cluster(ci); + swap_cluster_unlock(ci); } if (found) goto done; @@ -1203,7 +1165,7 @@ static bool swap_alloc_fast(swp_entry_t *entry, if (!si || !offset || !get_swap_device_info(si)) return false; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) offset = cluster_offset(si, ci); @@ -1211,7 +1173,7 @@ static bool swap_alloc_fast(swp_entry_t *entry, if (found) *entry = swp_entry(si->type, found); } else { - unlock_cluster(ci); + swap_cluster_unlock(ci); } put_swap_device(si); @@ -1479,14 +1441,14 @@ static void swap_entries_put_cache(struct swap_info_struct *si, unsigned long offset = swp_offset(entry); struct swap_cluster_info *ci; - ci = lock_cluster(si, offset); - if (swap_only_has_cache(si, offset, nr)) + ci = swap_cluster_lock(si, offset); + if 
(swap_only_has_cache(si, offset, nr)) { swap_entries_free(si, ci, entry, nr); - else { + } else { for (int i = 0; i < nr; i++, entry.val++) swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); } - unlock_cluster(ci); + swap_cluster_unlock(ci); } static bool swap_entries_put_map(struct swap_info_struct *si, @@ -1504,7 +1466,7 @@ static bool swap_entries_put_map(struct swap_info_struct *si, if (count != 1 && count != SWAP_MAP_SHMEM) goto fallback; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { goto locked_fallback; } @@ -1513,21 +1475,20 @@ static bool swap_entries_put_map(struct swap_info_struct *si, else for (i = 0; i < nr; i++) WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); - unlock_cluster(ci); + swap_cluster_unlock(ci); return has_cache; fallback: - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); locked_fallback: for (i = 0; i < nr; i++, entry.val++) { count = swap_entry_put_locked(si, ci, entry, 1); if (count == SWAP_HAS_CACHE) has_cache = true; } - unlock_cluster(ci); + swap_cluster_unlock(ci); return has_cache; - } /* @@ -1577,7 +1538,7 @@ static void swap_entries_free(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; /* It should never free entries across different clusters */ - VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1)); + VM_BUG_ON(ci != swp_offset_cluster(si, offset + nr_pages - 1)); VM_BUG_ON(cluster_is_empty(ci)); VM_BUG_ON(ci->count < nr_pages); @@ -1652,9 +1613,9 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) struct swap_cluster_info *ci; int count; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); count = swap_count(si->swap_map[offset]); - unlock_cluster(ci); + swap_cluster_unlock(ci); return !!count; } @@ -1677,7 +1638,7 @@ int swp_swapcount(swp_entry_t entry) offset = swp_offset(entry); - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); count = swap_count(si->swap_map[offset]); if (!(count & COUNT_CONTINUED)) @@ -1700,7 +1661,7 @@ int swp_swapcount(swp_entry_t entry) n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return count; } @@ -1715,7 +1676,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, int i; bool ret = false; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (nr_pages == 1) { if (swap_count(map[roffset])) ret = true; @@ -1728,7 +1689,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, } } unlock_out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return ret; } @@ -2662,8 +2623,8 @@ static void wait_for_allocation(struct swap_info_struct *si) BUG_ON(si->flags & SWP_WRITEOK); for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) { - ci = lock_cluster(si, offset); - unlock_cluster(ci); + ci = swap_cluster_lock(si, offset); + swap_cluster_unlock(ci); } } @@ -3579,7 +3540,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); VM_WARN_ON(usage == 1 && nr > 1); - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); err = 0; for (i = 0; i < nr; i++) { @@ -3634,7 +3595,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) } unlock_out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return err; } @@ -3733,7 +3694,7 @@ int 
add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) offset = swp_offset(entry); - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); count = swap_count(si->swap_map[offset]); @@ -3793,7 +3754,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) out_unlock_cont: spin_unlock(&si->cont_lock); out: - unlock_cluster(ci); + swap_cluster_unlock(ci); put_swap_device(si); outer: if (page) -- cgit v1.2.3 From 0fcf8ef4fdab8e5c91d1bce39c7fe6565974ffad Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:52 +0800 Subject: mm, swap: tidy up swap device and cluster info helpers swp_swap_info is the most commonly used helper for retrieving swap info. It has an internal check that may lead to a NULL return value, but almost none of its callers check the return value, making the internal check pointless. In fact, most of these callers have already ensured the entry is valid and never expect a NULL value. Tidy this up and improve the function names. If the caller can make sure the swap entry/type is valid and the device is pinned, use the newly introduced __swap_entry_to_info/__swap_type_to_info instead. They have more debug sanity checks and lower overhead as they are inlined. Callers that may expect a NULL value should use swap_entry_to_info/swap_type_to_info instead. No feature change. The rearranged code should have no effect; otherwise, it would already have been hitting NULL-dereference bugs. Only some new sanity checks are added, so potential issues may show up in debug builds. The new helpers will be used frequently with the swap table later when working with swap cache folios. A locked swap cache folio ensures the entries are valid and stable, so these helpers are very helpful there. Link: https://lkml.kernel.org/r/20250916160100.31545-8-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Chris Li Reviewed-by: Barry Song Acked-by: David Hildenbrand Suggested-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 ------ mm/page_io.c | 12 ++++++------ mm/swap.h | 38 +++++++++++++++++++++++++++++++++----- mm/swap_state.c | 4 ++-- mm/swapfile.c | 37 +++++++++++++++++++------------------ 5 files changed, 60 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 78cc48a65512..762f8db0e811 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -479,7 +479,6 @@ extern sector_t swapdev_block(int, pgoff_t); extern int __swap_count(swp_entry_t entry); extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); -struct swap_info_struct *swp_swap_info(swp_entry_t entry); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); @@ -492,11 +491,6 @@ static inline void put_swap_device(struct swap_info_struct *si) } #else /* CONFIG_SWAP */ -static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) -{ - return NULL; -} - static inline struct swap_info_struct *get_swap_device(swp_entry_t entry) { return NULL; diff --git a/mm/page_io.c b/mm/page_io.c index a2056a5ecb13..3c342db77ce3 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -204,7 +204,7 @@ static bool 
is_folio_zero_filled(struct folio *folio) static void swap_zeromap_folio_set(struct folio *folio) { struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio); - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); int nr_pages = folio_nr_pages(folio); swp_entry_t entry; unsigned int i; @@ -223,7 +223,7 @@ static void swap_zeromap_folio_set(struct folio *folio) static void swap_zeromap_folio_clear(struct folio *folio) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); swp_entry_t entry; unsigned int i; @@ -374,7 +374,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret) static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug) { struct swap_iocb *sio = swap_plug ? *swap_plug : NULL; - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct file *swap_file = sis->swap_file; loff_t pos = swap_dev_pos(folio->swap); @@ -446,7 +446,7 @@ static void swap_writepage_bdev_async(struct folio *folio, void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); /* @@ -537,7 +537,7 @@ static bool swap_read_folio_zeromap(struct folio *folio) static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct swap_iocb *sio = NULL; loff_t pos = swap_dev_pos(folio->swap); @@ -608,7 +608,7 @@ static void swap_read_folio_bdev_async(struct folio *folio, void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); bool synchronous = sis->flags & SWP_SYNCHRONOUS_IO; bool workingset = folio_test_workingset(folio); unsigned long pflags; diff --git a/mm/swap.h b/mm/swap.h index 138b5197c35e..30b1039c27fe 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -15,6 +15,8 @@ extern int page_cluster; #define swap_entry_order(order) 0 #endif +extern struct swap_info_struct *swap_info[]; + /* * We use this to track usage of a cluster. A cluster is a block of swap disk * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All @@ -53,9 +55,29 @@ enum swap_cluster_flags { #include /* for swp_offset */ #include /* for bio_end_io_t */ -static inline struct swap_cluster_info *swp_offset_cluster( +/* + * Callers of all helpers below must ensure the entry, type, or offset is + * valid, and protect the swap device with reference count or locks. 
+ */ +static inline struct swap_info_struct *__swap_type_to_info(int type) +{ + struct swap_info_struct *si; + + si = READ_ONCE(swap_info[type]); /* rcu_dereference() */ + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ + return si; +} + +static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) +{ + return __swap_type_to_info(swp_type(entry)); +} + +static inline struct swap_cluster_info *__swap_offset_to_cluster( struct swap_info_struct *si, pgoff_t offset) { + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ + VM_WARN_ON_ONCE(offset >= si->max); return &si->cluster_info[offset / SWAPFILE_CLUSTER]; } @@ -70,8 +92,9 @@ static inline struct swap_cluster_info *swp_offset_cluster( static inline struct swap_cluster_info *swap_cluster_lock( struct swap_info_struct *si, unsigned long offset) { - struct swap_cluster_info *ci = swp_offset_cluster(si, offset); + struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset); + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ spin_lock(&ci->lock); return ci; } @@ -170,7 +193,7 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, static inline unsigned int folio_swap_flags(struct folio *folio) { - return swp_swap_info(folio->swap)->flags; + return __swap_entry_to_info(folio->swap)->flags; } /* @@ -181,7 +204,7 @@ static inline unsigned int folio_swap_flags(struct folio *folio) static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, bool *is_zeromap) { - struct swap_info_struct *sis = swp_swap_info(entry); + struct swap_info_struct *sis = __swap_entry_to_info(entry); unsigned long start = swp_offset(entry); unsigned long end = start + max_nr; bool first_bit; @@ -200,7 +223,7 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) { - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); pgoff_t offset = swp_offset(entry); int i; @@ -219,6 +242,11 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) #else /* CONFIG_SWAP */ struct swap_iocb; +static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) +{ + return NULL; +} + static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 9225d6b695ad..0ad4f3b41f1b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -336,7 +336,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists) { - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); struct folio *folio; struct folio *new_folio = NULL; struct folio *result = NULL; @@ -560,7 +560,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, unsigned long offset = entry_offset; unsigned long start_offset, end_offset; unsigned long mask; - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); struct blk_plug plug; struct swap_iocb *splug = NULL; bool page_allocated; diff --git a/mm/swapfile.c b/mm/swapfile.c index 700e07cb1cbd..6f7a8c98d14d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -102,7 +102,7 @@ static PLIST_HEAD(swap_active_head); static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); 
-static struct swap_info_struct *swap_info[MAX_SWAPFILES]; +struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -124,14 +124,20 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = { .lock = INIT_LOCAL_LOCK(), }; -static struct swap_info_struct *swap_type_to_swap_info(int type) +/* May return NULL on invalid type, caller must check for NULL return */ +static struct swap_info_struct *swap_type_to_info(int type) { if (type >= MAX_SWAPFILES) return NULL; - return READ_ONCE(swap_info[type]); /* rcu_dereference() */ } +/* May return NULL on invalid entry, caller must check for NULL return */ +static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry) +{ + return swap_type_to_info(swp_type(entry)); +} + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ @@ -342,7 +348,7 @@ offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) sector_t swap_folio_sector(struct folio *folio) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct swap_extent *se; sector_t sector; pgoff_t offset; @@ -1300,7 +1306,7 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) if (!entry.val) goto out; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (!si) goto bad_nofile; if (data_race(!(si->flags & SWP_USED))) @@ -1415,7 +1421,7 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) if (!entry.val) goto out; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (!si) goto bad_nofile; if (!get_swap_device_info(si)) @@ -1538,7 +1544,7 @@ static void swap_entries_free(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; /* It should never free entries across different clusters */ - VM_BUG_ON(ci != swp_offset_cluster(si, offset + nr_pages - 1)); + VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1)); VM_BUG_ON(cluster_is_empty(ci)); VM_BUG_ON(ci->count < nr_pages); @@ -1596,7 +1602,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) int __swap_count(swp_entry_t entry) { - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); pgoff_t offset = swp_offset(entry); return swap_count(si->swap_map[offset]); @@ -1827,7 +1833,7 @@ out: swp_entry_t get_swap_page_of_type(int type) { - struct swap_info_struct *si = swap_type_to_swap_info(type); + struct swap_info_struct *si = swap_type_to_info(type); unsigned long offset; swp_entry_t entry = {0}; @@ -1908,7 +1914,7 @@ int find_first_swap(dev_t *device) */ sector_t swapdev_block(int type, pgoff_t offset) { - struct swap_info_struct *si = swap_type_to_swap_info(type); + struct swap_info_struct *si = swap_type_to_info(type); struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) @@ -2837,7 +2843,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) if (!l) return SEQ_START_TOKEN; - for (type = 0; (si = swap_type_to_swap_info(type)); type++) { + for (type = 0; (si = swap_type_to_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) @@ -2858,7 +2864,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) type = si->type + 1; ++(*pos); - for (; (si = swap_type_to_swap_info(type)); type++) { + for (; (si = swap_type_to_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; return si; @@ -3531,7 
+3537,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) unsigned char has_cache; int err, i; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (WARN_ON_ONCE(!si)) { pr_err("%s%08lx\n", Bad_file, entry.val); return -EINVAL; } @@ -3646,11 +3652,6 @@ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) swap_entries_put_cache(si, entry, nr); } -struct swap_info_struct *swp_swap_info(swp_entry_t entry) -{ - return swap_type_to_swap_info(swp_type(entry)); -} - /* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's -- cgit v1.2.3 From 8578e0c00dcf0c58fbc32d4904ecaf8e802a6590 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:56 +0800 Subject: mm, swap: use the swap table for the swap cache and switch API Introduce the basic swap table infrastructure, which for now is just a fixed-size flat array inside each swap cluster, with access wrappers. Each cluster contains a swap table of 512 entries. Each table entry is an opaque atomic long. It can be one of three types: a shadow type (XA_VALUE), a folio type (pointer), or NULL. In this first step, it only supports storing a folio or shadow, and it is a drop-in replacement for the current swap cache. Convert all swap cache users to use the new set of APIs. Chris Li has been suggesting using a new infrastructure for the swap cache for better performance, and that idea combined well with the swap table as the new backing structure. Now the lock contention range is reduced to 2M clusters, which is much smaller than the 64M address_space. And we can also drop the multiple address_space design. All the internal work is done with the swap_cache_get_* helpers. Swap cache lookup is still lock-less like before, and the helpers' contexts are the same as the original swap cache helpers'. They still require a pin on the swap device to prevent the backing data from being freed. Swap cache updates are now protected by the swap cluster lock instead of the XArray lock. This is mostly handled internally, but the new __swap_cache_* helpers require the caller to lock the cluster. So, a few new cluster access and locking helpers are also introduced. A fully cluster-based unified swap table can be implemented on top of this to take care of all count tracking and synchronization work, with dynamic allocation. It should reduce the memory usage while making the performance even better. 
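To make the three slot states concrete, here is a minimal userspace sketch. It is illustrative only: swp_tb_t and the tb_*() helpers are hypothetical stand-ins for the swp_tb_*() helpers this patch adds in mm/swap_table.h, relying solely on word-aligned pointers having bit 0 clear while XA_VALUE-style shadows have bit 0 set:

    #include <assert.h>
    #include <stdio.h>

    /* Hypothetical model of one swap table slot: 0 means empty, an even
     * non-zero word is a folio pointer, an odd word is a workingset
     * shadow (mirroring xa_mk_value(): (value << 1) | 1). */
    typedef unsigned long swp_tb_t;

    static swp_tb_t shadow_to_tb(unsigned long v) { return (v << 1) | 1UL; }
    static int tb_is_null(swp_tb_t tb)   { return tb == 0; }
    static int tb_is_shadow(swp_tb_t tb) { return (tb & 1UL) != 0; }
    static int tb_is_folio(swp_tb_t tb)  { return tb != 0 && !(tb & 1UL); }

    int main(void)
    {
            long folio;              /* stand-in for a struct folio */
            swp_tb_t slot = 0;       /* one of the 512 slots in a cluster */

            assert(tb_is_null(slot));
            slot = (swp_tb_t)&folio; /* folio added to the swap cache */
            assert(tb_is_folio(slot));
            slot = shadow_to_tb(42); /* folio evicted, shadow left behind */
            assert(tb_is_shadow(slot));
            printf("slot state checks passed\n");
            return 0;
    }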
Link: https://lkml.kernel.org/r/20250916160100.31545-12-ryncsn@gmail.com Co-developed-by: Chris Li Signed-off-by: Chris Li Signed-off-by: Kairui Song Acked-by: Chris Li Suggested-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + include/linux/swap.h | 2 - mm/huge_memory.c | 13 ++- mm/migrate.c | 19 +++- mm/shmem.c | 8 +- mm/swap.h | 154 +++++++++++++++++++++------ mm/swap_state.c | 293 ++++++++++++++++++++++----------------------------- mm/swap_table.h | 97 +++++++++++++++++ mm/swapfile.c | 100 +++++++++++++----- mm/vmscan.c | 20 ++-- 10 files changed, 459 insertions(+), 248 deletions(-) create mode 100644 mm/swap_table.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 3d113bfc3c82..4c8bbf70a3c7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16232,6 +16232,7 @@ F: include/linux/swapops.h F: mm/page_io.c F: mm/swap.c F: mm/swap.h +F: mm/swap_table.h F: mm/swap_state.c F: mm/swapfile.c diff --git a/include/linux/swap.h b/include/linux/swap.h index 762f8db0e811..e818fbade1e2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -480,8 +480,6 @@ extern int __swap_count(swp_entry_t entry); extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); struct backing_dev_info; -extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); -extern void exit_swap_address_space(unsigned int type); extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4c66e358685b..a9fc7a09167a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3720,7 +3720,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); if (folio_ref_freeze(folio, 1 + extra_pins)) { - struct address_space *swap_cache = NULL; + struct swap_cluster_info *ci = NULL; struct lruvec *lruvec; int expected_refs; @@ -3764,8 +3764,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, goto fail; } - swap_cache = swap_address_space(folio->swap); - xa_lock(&swap_cache->i_pages); + ci = swap_cluster_get_and_lock(folio); } /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ @@ -3797,8 +3796,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order, * Anonymous folio with swap cache. * NOTE: shmem in swap cache is not supported yet. 
*/ - if (swap_cache) { - __swap_cache_replace_folio(folio, new_folio); + if (ci) { + __swap_cache_replace_folio(ci, folio, new_folio); continue; } @@ -3833,8 +3832,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order, unlock_page_lruvec(lruvec); - if (swap_cache) - xa_unlock(&swap_cache->i_pages); + if (ci) + swap_cluster_unlock(ci); } else { spin_unlock(&ds_queue->split_queue_lock); ret = -EAGAIN; diff --git a/mm/migrate.c b/mm/migrate.c index c69cc13db692..aee61a980374 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -563,6 +563,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int expected_count) { XA_STATE(xas, &mapping->i_pages, folio_index(folio)); + struct swap_cluster_info *ci = NULL; struct zone *oldzone, *newzone; int dirty; long nr = folio_nr_pages(folio); @@ -591,9 +592,16 @@ static int __folio_migrate_mapping(struct address_space *mapping, oldzone = folio_zone(folio); newzone = folio_zone(newfolio); - xas_lock_irq(&xas); + if (folio_test_swapcache(folio)) + ci = swap_cluster_get_and_lock_irq(folio); + else + xas_lock_irq(&xas); + if (!folio_ref_freeze(folio, expected_count)) { - xas_unlock_irq(&xas); + if (ci) + swap_cluster_unlock_irq(ci); + else + xas_unlock_irq(&xas); return -EAGAIN; } @@ -624,7 +632,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, } if (folio_test_swapcache(folio)) - __swap_cache_replace_folio(folio, newfolio); + __swap_cache_replace_folio(ci, folio, newfolio); else xas_store(&xas, newfolio); @@ -635,8 +643,11 @@ static int __folio_migrate_mapping(struct address_space *mapping, */ folio_ref_unfreeze(folio, expected_count - nr); - xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ + if (ci) + swap_cluster_unlock(ci); + else + xas_unlock(&xas); /* * If moved to a different zone then also account diff --git a/mm/shmem.c b/mm/shmem.c index bbfbbc1bc4d6..cf0171a72e47 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2083,9 +2083,9 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index, struct vm_area_struct *vma) { + struct swap_cluster_info *ci; struct folio *new, *old = *foliop; swp_entry_t entry = old->swap; - struct address_space *swap_mapping = swap_address_space(entry); int nr_pages = folio_nr_pages(old); int error = 0; @@ -2116,12 +2116,12 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, new->swap = entry; folio_set_swapcache(new); - xa_lock_irq(&swap_mapping->i_pages); - __swap_cache_replace_folio(old, new); + ci = swap_cluster_get_and_lock_irq(old); + __swap_cache_replace_folio(ci, old, new); mem_cgroup_replace_folio(old, new); shmem_update_stats(new, nr_pages); shmem_update_stats(old, -nr_pages); - xa_unlock_irq(&swap_mapping->i_pages); + swap_cluster_unlock_irq(ci); folio_add_lru(new); *foliop = new; diff --git a/mm/swap.h b/mm/swap.h index fe579c81c6c4..742db4d46d23 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -2,6 +2,7 @@ #ifndef _MM_SWAP_H #define _MM_SWAP_H +#include /* for atomic_long_t */ struct mempolicy; struct swap_iocb; @@ -35,6 +36,7 @@ struct swap_cluster_info { u16 count; u8 flags; u8 order; + atomic_long_t *table; /* Swap table entries, see mm/swap_table.h */ struct list_head list; }; @@ -55,6 +57,11 @@ enum swap_cluster_flags { #include /* for swp_offset */ #include /* for bio_end_io_t */ +static inline unsigned int swp_cluster_offset(swp_entry_t entry) +{ + return swp_offset(entry) % SWAPFILE_CLUSTER; +} + /* * Callers of all 
helpers below must ensure the entry, type, or offset is * valid, and protect the swap device with reference count or locks. @@ -81,6 +88,25 @@ static inline struct swap_cluster_info *__swap_offset_to_cluster( return &si->cluster_info[offset / SWAPFILE_CLUSTER]; } +static inline struct swap_cluster_info *__swap_entry_to_cluster(swp_entry_t entry) +{ + return __swap_offset_to_cluster(__swap_entry_to_info(entry), + swp_offset(entry)); +} + +static __always_inline struct swap_cluster_info *__swap_cluster_lock( + struct swap_info_struct *si, unsigned long offset, bool irq) +{ + struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset); + + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ + if (irq) + spin_lock_irq(&ci->lock); + else + spin_lock(&ci->lock); + return ci; +} + /** * swap_cluster_lock - Lock and return the swap cluster of given offset. * @si: swap device the cluster belongs to. @@ -92,11 +118,49 @@ static inline struct swap_cluster_info *__swap_offset_to_cluster( static inline struct swap_cluster_info *swap_cluster_lock( struct swap_info_struct *si, unsigned long offset) { - struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset); + return __swap_cluster_lock(si, offset, false); +} - VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ - spin_lock(&ci->lock); - return ci; +static inline struct swap_cluster_info *__swap_cluster_get_and_lock( + const struct folio *folio, bool irq) +{ + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + return __swap_cluster_lock(__swap_entry_to_info(folio->swap), + swp_offset(folio->swap), irq); +} + +/* + * swap_cluster_get_and_lock - Locks the cluster that holds a folio's entries. + * @folio: The folio. + * + * This locks and returns the swap cluster that contains a folio's swap + * entries. The swap entries of a folio are always in one single cluster. + * The folio has to be locked so its swap entries won't change and the + * cluster won't be freed. + * + * Context: Caller must ensure the folio is locked and in the swap cache. + * Return: Pointer to the swap cluster. + */ +static inline struct swap_cluster_info *swap_cluster_get_and_lock( + const struct folio *folio) +{ + return __swap_cluster_get_and_lock(folio, false); +} + +/* + * swap_cluster_get_and_lock_irq - Locks the cluster that holds a folio's entries. + * @folio: The folio. + * + * Same as swap_cluster_get_and_lock but also disable IRQ. + * + * Context: Caller must ensure the folio is locked and in the swap cache. + * Return: Pointer to the swap cluster. 
+ */ +static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq( + const struct folio *folio) +{ + return __swap_cluster_get_and_lock(folio, true); } static inline void swap_cluster_unlock(struct swap_cluster_info *ci) @@ -104,6 +168,11 @@ static inline void swap_cluster_unlock(struct swap_cluster_info *ci) spin_unlock(&ci->lock); } +static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) +{ + spin_unlock_irq(&ci->lock); +} + /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; @@ -123,10 +192,11 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug); #define SWAP_ADDRESS_SPACE_SHIFT 14 #define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) #define SWAP_ADDRESS_SPACE_MASK (SWAP_ADDRESS_SPACE_PAGES - 1) -extern struct address_space *swapper_spaces[]; -#define swap_address_space(entry) \ - (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ - >> SWAP_ADDRESS_SPACE_SHIFT]) +extern struct address_space swap_space; +static inline struct address_space *swap_address_space(swp_entry_t entry) +{ + return &swap_space; +} /* * Return the swap device position of the swap entry. @@ -136,15 +206,6 @@ static inline loff_t swap_dev_pos(swp_entry_t entry) return ((loff_t)swp_offset(entry)) << PAGE_SHIFT; } -/* - * Return the swap cache index of the swap entry. - */ -static inline pgoff_t swap_cache_index(swp_entry_t entry) -{ - BUILD_BUG_ON((SWP_OFFSET_MASK | SWAP_ADDRESS_SPACE_MASK) != SWP_OFFSET_MASK); - return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK; -} - /** * folio_matches_swap_entry - Check if a folio matches a given swap entry. * @folio: The folio. @@ -180,14 +241,14 @@ static inline bool folio_matches_swap_entry(const struct folio *folio, */ struct folio *swap_cache_get_folio(swp_entry_t entry); void *swap_cache_get_shadow(swp_entry_t entry); -int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, - gfp_t gfp, void **shadow); +void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow); void swap_cache_del_folio(struct folio *folio); -void __swap_cache_del_folio(struct folio *folio, - swp_entry_t entry, void *shadow); -void __swap_cache_replace_folio(struct folio *old, struct folio *new); -void swap_cache_clear_shadow(int type, unsigned long begin, - unsigned long end); +/* Below helpers require the caller to lock and pass in the swap cluster. 
*/ +void __swap_cache_del_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry, void *shadow); +void __swap_cache_replace_folio(struct swap_cluster_info *ci, + struct folio *old, struct folio *new); +void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents); void show_swap_cache_info(void); void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); @@ -255,6 +316,32 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) #else /* CONFIG_SWAP */ struct swap_iocb; +static inline struct swap_cluster_info *swap_cluster_lock( + struct swap_info_struct *si, pgoff_t offset, bool irq) +{ + return NULL; +} + +static inline struct swap_cluster_info *swap_cluster_get_and_lock( + struct folio *folio) +{ + return NULL; +} + +static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq( + struct folio *folio) +{ + return NULL; +} + +static inline void swap_cluster_unlock(struct swap_cluster_info *ci) +{ +} + +static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) +{ +} + static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) { return NULL; @@ -272,11 +359,6 @@ static inline struct address_space *swap_address_space(swp_entry_t entry) return NULL; } -static inline pgoff_t swap_cache_index(swp_entry_t entry) -{ - return 0; -} - static inline bool folio_matches_swap_entry(const struct folio *folio, swp_entry_t entry) { return false; @@ -323,21 +405,21 @@ static inline void *swap_cache_get_shadow(swp_entry_t entry) return NULL; } -static inline int swap_cache_add_folio(swp_entry_t entry, struct folio *folio, - gfp_t gfp, void **shadow) +static inline void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow) { - return -EINVAL; } static inline void swap_cache_del_folio(struct folio *folio) { } -static inline void __swap_cache_del_folio(struct folio *folio, swp_entry_t entry, void *shadow) +static inline void __swap_cache_del_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry, void *shadow) { } -static inline void __swap_cache_replace_folio(struct folio *old, struct folio *new) +static inline void __swap_cache_replace_folio(struct swap_cluster_info *ci, + struct folio *old, struct folio *new) { } @@ -371,8 +453,10 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) */ static inline pgoff_t folio_index(struct folio *folio) { +#ifdef CONFIG_SWAP if (unlikely(folio_test_swapcache(folio))) - return swap_cache_index(folio->swap); + return swp_offset(folio->swap); +#endif return folio->index; } diff --git a/mm/swap_state.c b/mm/swap_state.c index d1f5b8fa52fc..2558a648d671 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -23,6 +23,7 @@ #include #include #include "internal.h" +#include "swap_table.h" #include "swap.h" /* @@ -36,8 +37,10 @@ static const struct address_space_operations swap_aops = { #endif }; -struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; -static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; +struct address_space swap_space __read_mostly = { + .a_ops = &swap_aops, +}; + static bool enable_vma_readahead __read_mostly = true; #define SWAP_RA_ORDER_CEILING 5 @@ -83,11 +86,20 @@ void show_swap_cache_info(void) */ struct folio *swap_cache_get_folio(swp_entry_t entry) { - struct folio *folio = filemap_get_folio(swap_address_space(entry), - swap_cache_index(entry)); - if (IS_ERR(folio)) - return NULL; - return folio; + unsigned long swp_tb; + struct folio *folio; + + for (;;) { + swp_tb = 
__swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); + if (!swp_tb_is_folio(swp_tb)) + return NULL; + folio = swp_tb_to_folio(swp_tb); + if (likely(folio_try_get(folio))) + return folio; + } + + return NULL; } /** @@ -100,13 +112,13 @@ struct folio *swap_cache_get_folio(swp_entry_t entry) */ void *swap_cache_get_shadow(swp_entry_t entry) { - struct address_space *address_space = swap_address_space(entry); - pgoff_t idx = swap_cache_index(entry); - void *shadow; + unsigned long swp_tb; + + swp_tb = __swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); + if (swp_tb_is_shadow(swp_tb)) + return swp_tb_to_shadow(swp_tb); - shadow = xa_load(&address_space->i_pages, idx); - if (xa_is_value(shadow)) - return shadow; return NULL; } @@ -119,61 +131,48 @@ void *swap_cache_get_shadow(swp_entry_t entry) * * Context: Caller must ensure @entry is valid and protect the swap device * with reference count or locks. - * The caller also needs to mark the corresponding swap_map slots with - * SWAP_HAS_CACHE to avoid race or conflict. - * Return: Returns 0 on success, error code otherwise. + * The caller also needs to update the corresponding swap_map slots with + * SWAP_HAS_CACHE bit to avoid race or conflict. */ -int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, - gfp_t gfp, void **shadowp) +void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp) { - struct address_space *address_space = swap_address_space(entry); - pgoff_t idx = swap_cache_index(entry); - XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); - unsigned long i, nr = folio_nr_pages(folio); - void *old; - - xas_set_update(&xas, workingset_update_node); - - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); - VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); + void *shadow = NULL; + unsigned long old_tb, new_tb; + struct swap_cluster_info *ci; + unsigned int ci_start, ci_off, ci_end; + unsigned long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); + + new_tb = folio_to_swp_tb(folio); + ci_start = swp_cluster_offset(entry); + ci_end = ci_start + nr_pages; + ci_off = ci_start; + ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); + do { + old_tb = __swap_table_xchg(ci, ci_off, new_tb); + WARN_ON_ONCE(swp_tb_is_folio(old_tb)); + if (swp_tb_is_shadow(old_tb)) + shadow = swp_tb_to_shadow(old_tb); + } while (++ci_off < ci_end); - folio_ref_add(folio, nr); + folio_ref_add(folio, nr_pages); folio_set_swapcache(folio); folio->swap = entry; + swap_cluster_unlock(ci); - do { - xas_lock_irq(&xas); - xas_create_range(&xas); - if (xas_error(&xas)) - goto unlock; - for (i = 0; i < nr; i++) { - VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); - if (shadowp) { - old = xas_load(&xas); - if (xa_is_value(old)) - *shadowp = old; - } - xas_store(&xas, folio); - xas_next(&xas); - } - address_space->nrpages += nr; - __node_stat_mod_folio(folio, NR_FILE_PAGES, nr); - __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr); -unlock: - xas_unlock_irq(&xas); - } while (xas_nomem(&xas, gfp)); - - if (!xas_error(&xas)) - return 0; + node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); - folio_clear_swapcache(folio); - folio_ref_sub(folio, nr); - return xas_error(&xas); + if 
(shadowp) + *shadowp = shadow; } /** * __swap_cache_del_folio - Removes a folio from the swap cache. + * @ci: The locked swap cluster. * @folio: The folio. * @entry: The first swap entry that the folio corresponds to. * @shadow: shadow value to be filled in the swap cache. @@ -181,34 +180,36 @@ unlock: * Removes a folio from the swap cache and fills a shadow in place. * This won't put the folio's refcount. The caller has to do that. * - * Context: Caller must hold the xa_lock, ensure the folio is - * locked and in the swap cache, using the index of @entry. + * Context: Caller must ensure the folio is locked and in the swap cache + * using the index of @entry, and lock the cluster that holds the entries. */ -void __swap_cache_del_folio(struct folio *folio, +void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) { - struct address_space *address_space = swap_address_space(entry); - int i; - long nr = folio_nr_pages(folio); - pgoff_t idx = swap_cache_index(entry); - XA_STATE(xas, &address_space->i_pages, idx); - - xas_set_update(&xas, workingset_update_node); - - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); - VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); - - for (i = 0; i < nr; i++) { - void *entry = xas_store(&xas, shadow); - VM_BUG_ON_PAGE(entry != folio, entry); - xas_next(&xas); - } + unsigned long old_tb, new_tb; + unsigned int ci_start, ci_off, ci_end; + unsigned long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); + + new_tb = shadow_swp_to_tb(shadow); + ci_start = swp_cluster_offset(entry); + ci_end = ci_start + nr_pages; + ci_off = ci_start; + do { + /* If shadow is NULL, we sets an empty shadow */ + old_tb = __swap_table_xchg(ci, ci_off, new_tb); + WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || + swp_tb_to_folio(old_tb) != folio); + } while (++ci_off < ci_end); + folio->swap.val = 0; folio_clear_swapcache(folio); - address_space->nrpages -= nr; - __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); - __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); + node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); } /** @@ -223,12 +224,12 @@ void __swap_cache_del_folio(struct folio *folio, */ void swap_cache_del_folio(struct folio *folio) { + struct swap_cluster_info *ci; swp_entry_t entry = folio->swap; - struct address_space *address_space = swap_address_space(entry); - xa_lock_irq(&address_space->i_pages); - __swap_cache_del_folio(folio, entry, NULL); - xa_unlock_irq(&address_space->i_pages); + ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); + __swap_cache_del_folio(ci, folio, entry, NULL); + swap_cluster_unlock(ci); put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); @@ -236,6 +237,7 @@ void swap_cache_del_folio(struct folio *folio) /** * __swap_cache_replace_folio - Replace a folio in the swap cache. + * @ci: The locked swap cluster. * @old: The old folio to be replaced. * @new: The new folio. * @@ -244,65 +246,62 @@ void swap_cache_del_folio(struct folio *folio) * entries. Replacement will take the new folio's swap entry value as * the starting offset to override all slots covered by the new folio. 
* - * Context: Caller must ensure both folios are locked, also lock the - * swap address_space that holds the old folio to avoid races. + * Context: Caller must ensure both folios are locked, and lock the + * cluster that holds the old folio to be replaced. */ -void __swap_cache_replace_folio(struct folio *old, struct folio *new) +void __swap_cache_replace_folio(struct swap_cluster_info *ci, + struct folio *old, struct folio *new) { swp_entry_t entry = new->swap; unsigned long nr_pages = folio_nr_pages(new); - unsigned long offset = swap_cache_index(entry); - unsigned long end = offset + nr_pages; - - XA_STATE(xas, &swap_address_space(entry)->i_pages, offset); + unsigned int ci_off = swp_cluster_offset(entry); + unsigned int ci_end = ci_off + nr_pages; + unsigned long old_tb, new_tb; VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); VM_WARN_ON_ONCE(!entry.val); /* Swap cache still stores N entries instead of a high-order entry */ + new_tb = folio_to_swp_tb(new); do { - WARN_ON_ONCE(xas_store(&xas, new) != old); - xas_next(&xas); - } while (++offset < end); + old_tb = __swap_table_xchg(ci, ci_off, new_tb); + WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); + } while (++ci_off < ci_end); + + /* + * If the old folio is partially replaced (e.g., splitting a large + * folio, the old folio is shrunk, and new split sub folios replace + * the shrunk part), ensure the new folio doesn't overlap it. + */ + if (IS_ENABLED(CONFIG_DEBUG_VM) && + folio_order(old) != folio_order(new)) { + ci_off = swp_cluster_offset(old->swap); + ci_end = ci_off + folio_nr_pages(old); + while (ci_off++ < ci_end) + WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old); + } } /** * swap_cache_clear_shadow - Clears a set of shadows in the swap cache. - * @type: Indicates the swap device. - * @begin: Beginning offset of the range. - * @end: Ending offset of the range. + * @entry: The starting index entry. + * @nr_ents: How many slots need to be cleared. * - * Context: Caller must ensure the range is valid and hold a reference to - * the swap device. + * Context: Caller must ensure the range is valid, all in one single cluster, + * not occupied by any folio, and lock the cluster. 
*/ -void swap_cache_clear_shadow(int type, unsigned long begin, - unsigned long end) +void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents) { - unsigned long curr = begin; - void *old; - - for (;;) { - swp_entry_t entry = swp_entry(type, curr); - unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK; - struct address_space *address_space = swap_address_space(entry); - XA_STATE(xas, &address_space->i_pages, index); - - xas_set_update(&xas, workingset_update_node); - - xa_lock_irq(&address_space->i_pages); - xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) { - if (!xa_is_value(old)) - continue; - xas_store(&xas, NULL); - } - xa_unlock_irq(&address_space->i_pages); + struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); + unsigned int ci_off = swp_cluster_offset(entry), ci_end; + unsigned long old; - /* search the next swapcache until we meet end */ - curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES); - if (curr > end) - break; - } + ci_end = ci_off + nr_ents; + do { + old = __swap_table_xchg(ci, ci_off, null_to_swp_tb()); + WARN_ON_ONCE(swp_tb_is_folio(old)); + } while (++ci_off < ci_end); } /* @@ -482,10 +481,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry)) goto fail_unlock; - /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (swap_cache_add_folio(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) - goto fail_unlock; - + swap_cache_add_folio(new_folio, entry, &shadow); memcg1_swapin(entry, 1); if (shadow) @@ -677,41 +673,6 @@ skip: return folio; } -int init_swap_address_space(unsigned int type, unsigned long nr_pages) -{ - struct address_space *spaces, *space; - unsigned int i, nr; - - nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); - spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL); - if (!spaces) - return -ENOMEM; - for (i = 0; i < nr; i++) { - space = spaces + i; - xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ); - atomic_set(&space->i_mmap_writable, 0); - space->a_ops = &swap_aops; - /* swap cache doesn't use writeback related tags */ - mapping_set_no_writeback_tags(space); - } - nr_swapper_spaces[type] = nr; - swapper_spaces[type] = spaces; - - return 0; -} - -void exit_swap_address_space(unsigned int type) -{ - int i; - struct address_space *spaces = swapper_spaces[type]; - - for (i = 0; i < nr_swapper_spaces[type]; i++) - VM_WARN_ON_ONCE(!mapping_empty(&spaces[i])); - kvfree(spaces); - nr_swapper_spaces[type] = 0; - swapper_spaces[type] = NULL; -} - static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start, unsigned long *end) { @@ -884,7 +845,7 @@ static const struct attribute_group swap_attr_group = { .attrs = swap_attrs, }; -static int __init swap_init_sysfs(void) +static int __init swap_init(void) { int err; struct kobject *swap_kobj; @@ -899,11 +860,13 @@ static int __init swap_init_sysfs(void) pr_err("failed to register swap group\n"); goto delete_obj; } + /* Swap cache writeback is LRU based, no tags for it */ + mapping_set_no_writeback_tags(&swap_space); return 0; delete_obj: kobject_put(swap_kobj); return err; } -subsys_initcall(swap_init_sysfs); +subsys_initcall(swap_init); #endif diff --git a/mm/swap_table.h b/mm/swap_table.h new file mode 100644 index 000000000000..e1f7cc009701 --- /dev/null +++ b/mm/swap_table.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_SWAP_TABLE_H +#define _MM_SWAP_TABLE_H + +#include "swap.h" + +/* + * A swap table entry 
represents the status of a swap slot on a swap + * (physical or virtual) device. The swap table in each cluster is a + * 1:1 map of the swap slots in this cluster. + * + * Each swap table entry could be a pointer (folio), a XA_VALUE + * (shadow), or NULL. + */ + +/* + * Helpers for casting one type of info into a swap table entry. + */ +static inline unsigned long null_to_swp_tb(void) +{ + BUILD_BUG_ON(sizeof(unsigned long) != sizeof(atomic_long_t)); + return 0; +} + +static inline unsigned long folio_to_swp_tb(struct folio *folio) +{ + BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *)); + return (unsigned long)folio; +} + +static inline unsigned long shadow_swp_to_tb(void *shadow) +{ + BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) != + BITS_PER_BYTE * sizeof(unsigned long)); + VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow)); + return (unsigned long)shadow; +} + +/* + * Helpers for swap table entry type checking. + */ +static inline bool swp_tb_is_null(unsigned long swp_tb) +{ + return !swp_tb; +} + +static inline bool swp_tb_is_folio(unsigned long swp_tb) +{ + return !xa_is_value((void *)swp_tb) && !swp_tb_is_null(swp_tb); +} + +static inline bool swp_tb_is_shadow(unsigned long swp_tb) +{ + return xa_is_value((void *)swp_tb); +} + +/* + * Helpers for retrieving info from swap table. + */ +static inline struct folio *swp_tb_to_folio(unsigned long swp_tb) +{ + VM_WARN_ON(!swp_tb_is_folio(swp_tb)); + return (void *)swp_tb; +} + +static inline void *swp_tb_to_shadow(unsigned long swp_tb) +{ + VM_WARN_ON(!swp_tb_is_shadow(swp_tb)); + return (void *)swp_tb; +} + +/* + * Helpers for accessing or modifying the swap table of a cluster, + * the swap cluster must be locked. + */ +static inline void __swap_table_set(struct swap_cluster_info *ci, + unsigned int off, unsigned long swp_tb) +{ + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); + atomic_long_set(&ci->table[off], swp_tb); +} + +static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci, + unsigned int off, unsigned long swp_tb) +{ + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); + /* Ordering is guaranteed by cluster lock, relax */ + return atomic_long_xchg_relaxed(&ci->table[off], swp_tb); +} + +static inline unsigned long __swap_table_get(struct swap_cluster_info *ci, + unsigned int off) +{ + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); + return atomic_long_read(&ci->table[off]); +} +#endif diff --git a/mm/swapfile.c b/mm/swapfile.c index 51f781c43537..b183e96be289 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -46,6 +46,7 @@ #include #include #include +#include "swap_table.h" #include "internal.h" #include "swap.h" @@ -421,6 +422,34 @@ static inline unsigned int cluster_offset(struct swap_info_struct *si, return cluster_index(si, ci) * SWAPFILE_CLUSTER; } +static int swap_cluster_alloc_table(struct swap_cluster_info *ci) +{ + WARN_ON(ci->table); + ci->table = kzalloc(sizeof(unsigned long) * SWAPFILE_CLUSTER, GFP_KERNEL); + if (!ci->table) + return -ENOMEM; + return 0; +} + +static void swap_cluster_free_table(struct swap_cluster_info *ci) +{ + unsigned int ci_off; + unsigned long swp_tb; + + if (!ci->table) + return; + + for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) { + swp_tb = __swap_table_get(ci, ci_off); + if (!swp_tb_is_null(swp_tb)) + pr_err_once("swap: unclean swap space on swapoff: 0x%lx", + swp_tb); + } + + kfree(ci->table); + ci->table = NULL; +} + static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, enum swap_cluster_flags new_flags) @@ -703,6 +732,26 @@ static bool 
cluster_scan_range(struct swap_info_struct *si, return true; } +/* + * Currently, the swap table is not used for count tracking, just + * do a sanity check here to ensure nothing leaked, so the swap + * table should be empty upon freeing. + */ +static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci, + unsigned int start, unsigned int nr) +{ + unsigned int ci_off = start % SWAPFILE_CLUSTER; + unsigned int ci_end = ci_off + nr; + unsigned long swp_tb; + + if (IS_ENABLED(CONFIG_DEBUG_VM)) { + do { + swp_tb = __swap_table_get(ci, ci_off); + VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); + } while (++ci_off < ci_end); + } +} + static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned int start, unsigned char usage, unsigned int order) @@ -722,6 +771,7 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster ci->order = order; memset(si->swap_map + start, usage, nr_pages); + swap_cluster_assert_table_empty(ci, start, nr_pages); swap_range_alloc(si, nr_pages); ci->count += nr_pages; @@ -1124,7 +1174,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify(si->bdev, offset); offset++; } - swap_cache_clear_shadow(si->type, begin, end); + __swap_cache_clear_shadow(swp_entry(si->type, begin), nr_entries); /* * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 @@ -1281,16 +1331,7 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp) if (!entry.val) return -ENOMEM; - /* - * XArray node allocations from PF_MEMALLOC contexts could - * completely exhaust the page allocator. __GFP_NOMEMALLOC - * stops emergency reserves from being allocated. - * - * TODO: this could cause a theoretical memory reclaim - * deadlock in the swap out path. - */ - if (swap_cache_add_folio(folio, entry, gfp | __GFP_NOMEMALLOC, NULL)) - goto out_free; + swap_cache_add_folio(folio, entry, NULL); return 0; @@ -1556,6 +1597,7 @@ static void swap_entries_free(struct swap_info_struct *si, mem_cgroup_uncharge_swap(entry, nr_pages); swap_range_free(si, offset, nr_pages); + swap_cluster_assert_table_empty(ci, offset, nr_pages); if (!ci->count) free_cluster(si, ci); @@ -2634,6 +2676,18 @@ static void wait_for_allocation(struct swap_info_struct *si) } } +static void free_cluster_info(struct swap_cluster_info *cluster_info, + unsigned long maxpages) +{ + int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); + + if (!cluster_info) + return; + for (i = 0; i < nr_clusters; i++) + swap_cluster_free_table(&cluster_info[i]); + kvfree(cluster_info); +} + /* * Called after swap device's reference count is dead, so * neither scan nor allocation will use it. 
@@ -2768,12 +2822,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_file = p->swap_file; p->swap_file = NULL; - p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; zeromap = p->zeromap; p->zeromap = NULL; cluster_info = p->cluster_info; + free_cluster_info(cluster_info, p->max); + p->max = 0; p->cluster_info = NULL; spin_unlock(&p->lock); spin_unlock(&swap_lock); @@ -2784,10 +2839,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->global_cluster = NULL; vfree(swap_map); kvfree(zeromap); - kvfree(cluster_info); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); - exit_swap_address_space(p->type); inode = mapping->host; @@ -3171,8 +3224,11 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, if (!cluster_info) goto err; - for (i = 0; i < nr_clusters; i++) + for (i = 0; i < nr_clusters; i++) { spin_lock_init(&cluster_info[i].lock); + if (swap_cluster_alloc_table(&cluster_info[i])) + goto err_free; + } if (!(si->flags & SWP_SOLIDSTATE)) { si->global_cluster = kmalloc(sizeof(*si->global_cluster), @@ -3233,9 +3289,8 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, } return cluster_info; - err_free: - kvfree(cluster_info); + free_cluster_info(cluster_info, maxpages); err: return ERR_PTR(err); } @@ -3429,13 +3484,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } - error = init_swap_address_space(si->type, maxpages); - if (error) - goto bad_swap_unlock_inode; - error = zswap_swapon(si->type, maxpages); if (error) - goto free_swap_address_space; + goto bad_swap_unlock_inode; /* * Flush any pending IO and dirty mappings before we start using this @@ -3470,8 +3521,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto out; free_swap_zswap: zswap_swapoff(si->type); -free_swap_address_space: - exit_swap_address_space(si->type); bad_swap_unlock_inode: inode_unlock(inode); bad_swap: @@ -3486,7 +3535,8 @@ bad_swap: spin_unlock(&swap_lock); vfree(swap_map); kvfree(zeromap); - kvfree(cluster_info); + if (cluster_info) + free_cluster_info(cluster_info, maxpages); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) diff --git a/mm/vmscan.c b/mm/vmscan.c index c79c6806560b..e170c12e2065 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -730,13 +730,18 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, { int refcount; void *shadow = NULL; + struct swap_cluster_info *ci; BUG_ON(!folio_test_locked(folio)); BUG_ON(mapping != folio_mapping(folio)); - if (!folio_test_swapcache(folio)) + if (folio_test_swapcache(folio)) { + ci = swap_cluster_get_and_lock_irq(folio); + } else { spin_lock(&mapping->host->i_lock); - xa_lock_irq(&mapping->i_pages); + xa_lock_irq(&mapping->i_pages); + } + /* * The non racy check for a busy folio. 
 * @@ -776,9 +781,9 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - __swap_cache_del_folio(folio, swap, shadow); + __swap_cache_del_folio(ci, folio, swap, shadow); memcg1_swapout(folio, swap); - xa_unlock_irq(&mapping->i_pages); + swap_cluster_unlock_irq(ci); put_swap_folio(folio, swap); } else { void (*free_folio)(struct folio *); @@ -816,9 +821,12 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, return 1; cannot_free: - xa_unlock_irq(&mapping->i_pages); - if (!folio_test_swapcache(folio)) + if (folio_test_swapcache(folio)) { + swap_cluster_unlock_irq(ci); + } else { + xa_unlock_irq(&mapping->i_pages); spin_unlock(&mapping->host->i_lock); + } return 0; } -- cgit v1.2.3 From 6106864b878e1ce5ecab4b8ffffff85e9ec69b78 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 2 Sep 2025 08:36:11 +0000 Subject: maple_tree: remove lockdep_map_p typedef The ma_external_lock field isn't used anywhere when CONFIG_LOCKDEP=n, so just get rid of it in that configuration. This also avoids generating a typedef called lockdep_map_p that could overlap with typedefs in other header files. Link: https://lkml.kernel.org/r/20250902-maple-lockdep-p-v1-1-3ae5a398a379@google.com Signed-off-by: Alice Ryhl Reviewed-by: Danilo Krummrich Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 05730171d201..47f9002ae92d 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -194,7 +194,6 @@ enum store_type { #define MAPLE_RESERVED_RANGE 4096 #ifdef CONFIG_LOCKDEP -typedef struct lockdep_map *lockdep_map_p; #define mt_lock_is_held(mt) \ (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock)) @@ -207,7 +206,6 @@ typedef struct lockdep_map *lockdep_map_p; #define mt_on_stack(mt) (mt).ma_external_lock = NULL #else -typedef struct { /* nothing */ } lockdep_map_p; #define mt_lock_is_held(mt) 1 #define mt_write_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) @@ -230,8 +228,10 @@ typedef struct { /* nothing */ } lockdep_map_p; */ struct maple_tree { union { - spinlock_t ma_lock; - lockdep_map_p ma_external_lock; + spinlock_t ma_lock; +#ifdef CONFIG_LOCKDEP + struct lockdep_map *ma_external_lock; +#endif }; unsigned int ma_flags; void __rcu *ma_root; -- cgit v1.2.3 From 522abd92279a8ea55bcc687f77697d4c0aaba6c0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Sep 2025 18:11:00 +0100 Subject: ptdesc: convert __page_flags to pt_flags Patch series "Some ptdesc cleanups". The first two patches here are preparation for splitting struct ptdesc from struct page and struct folio. I think their only dependency is on the memdesc_flags_t patches from August, which are in mm-new. The third patch is just something I noticed while working on the code. This patch (of 3): Use the new memdesc_flags_t type to show that these are the same bits as page/folio/slab and therefore have the zone/node/section information in them. Remove a use of ptdesc_folio() by converting pagetable_is_reserved() to use test_bit() directly. 
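A hedged userspace sketch of the resulting layout may help; the bit number and the test_bit() helper below are illustrative stand-ins rather than the kernel's implementations, but the shape follows the patch: ptdesc overlays the same flags word as struct page, PT_reserved aliases PG_reserved, and pagetable_is_reserved() tests the bit directly:

    #include <assert.h>
    #include <stdio.h>

    /* Hypothetical stand-ins: the real PG_reserved bit number differs,
     * and test_bit() is the kernel's bitops helper, not this function. */
    enum { PG_reserved = 2 };
    enum pt_flags { PT_reserved = PG_reserved };

    typedef struct { unsigned long f; } memdesc_flags_t;
    struct ptdesc { memdesc_flags_t pt_flags; }; /* first word, like page->flags */

    static int test_bit(int nr, const unsigned long *word)
    {
            return (int)((*word >> nr) & 1UL);
    }

    static int pagetable_is_reserved(struct ptdesc *pt)
    {
            return test_bit(PT_reserved, &pt->pt_flags.f);
    }

    int main(void)
    {
            struct ptdesc pt = { .pt_flags = { 1UL << PT_reserved } };

            assert(pagetable_is_reserved(&pt));
            pt.pt_flags.f = 0;
            assert(!pagetable_is_reserved(&pt));
            printf("pt_flags reserved-bit test OK\n");
            return 0;
    }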
Link: https://lkml.kernel.org/r/20250908171104.2409217-1-willy@infradead.org Link: https://lkml.kernel.org/r/20250908171104.2409217-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 ++++++- include/linux/mm_types.h | 6 +++--- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index a6bfa46937a8..8dd71392eba7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2934,6 +2934,11 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU */ +enum pt_flags { + PT_reserved = PG_reserved, + /* High bits are used for zone/node/section */ +}; + static inline struct ptdesc *virt_to_ptdesc(const void *x) { return page_ptdesc(virt_to_page(x)); @@ -2951,7 +2956,7 @@ static inline void *ptdesc_address(const struct ptdesc *pt) static inline bool pagetable_is_reserved(struct ptdesc *pt) { - return folio_test_reserved(ptdesc_folio(pt)); + return test_bit(PT_reserved, &pt->pt_flags.f); } /** diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ff2b4e13215f..f048dc80646e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -524,7 +524,7 @@ FOLIO_MATCH(compound_head, _head_3); /** * struct ptdesc - Memory descriptor for page tables. - * @__page_flags: Same as page flags. Powerpc only. + * @pt_flags: enum pt_flags plus zone/node/section. * @pt_rcu_head: For freeing page table pages. * @pt_list: List of used page tables. Used for s390 gmap shadow pages * (which are not linked into the user page tables) and x86 @@ -546,7 +546,7 @@ FOLIO_MATCH(compound_head, _head_3); * understanding of the issues. */ struct ptdesc { - unsigned long __page_flags; + memdesc_flags_t pt_flags; union { struct rcu_head pt_rcu_head; @@ -584,7 +584,7 @@ struct ptdesc { #define TABLE_MATCH(pg, pt) \ static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt)) -TABLE_MATCH(flags, __page_flags); +TABLE_MATCH(flags, pt_flags); TABLE_MATCH(compound_head, pt_list); TABLE_MATCH(compound_head, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); -- cgit v1.2.3 From f0c92726e89f5c6c092526787465617a68af154f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Sep 2025 18:11:01 +0100 Subject: ptdesc: remove references to folios from __pagetable_ctor() and pagetable_dtor() In preparation for splitting struct ptdesc from struct page and struct folio, remove mentions of struct folio from these functions. Introduce ptdesc_nr_pages() to avoid using lruvec_stat_add/sub_folio() Link: https://lkml.kernel.org/r/20250908171104.2409217-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8dd71392eba7..25f56e209ec8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2097,9 +2097,9 @@ static inline long folio_nr_pages(const struct folio *folio) * page. compound_nr() can be called on a tail page, and is defined to * return 1 in that case. 
*/ -static inline long compound_nr(struct page *page) +static inline long compound_nr(const struct page *page) { - struct folio *folio = (struct folio *)page; + const struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags.f)) return 1; @@ -3066,21 +3066,26 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ +static inline unsigned long ptdesc_nr_pages(const struct ptdesc *ptdesc) +{ + return compound_nr(ptdesc_page(ptdesc)); +} + static inline void __pagetable_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); + pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); - __folio_set_pgtable(folio); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __SetPageTable(ptdesc_page(ptdesc)); + mod_node_page_state(pgdat, NR_PAGETABLE, ptdesc_nr_pages(ptdesc)); } static inline void pagetable_dtor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); + pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); ptlock_free(ptdesc); - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); + __ClearPageTable(ptdesc_page(ptdesc)); + mod_node_page_state(pgdat, NR_PAGETABLE, -ptdesc_nr_pages(ptdesc)); } static inline void pagetable_dtor_free(struct ptdesc *ptdesc) -- cgit v1.2.3 From 90ec2df9dd31653ceac4a35d2440b108bdf27550 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Sep 2025 18:11:02 +0100 Subject: ptdesc: remove ptdesc_to_virt() This has the same effect as ptdesc_address() so convert the callers to use that and delete the function. Add kernel-doc for ptdesc_address(). Link: https://lkml.kernel.org/r/20250908171104.2409217-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- arch/arm/mm/mmu.c | 2 +- arch/s390/mm/pgalloc.c | 6 +++--- include/linux/mm.h | 11 ++++++----- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index edb7f56b7c91..8bac96e205ac 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -737,7 +737,7 @@ static void *__init late_alloc(unsigned long sz) if (!ptdesc || !pagetable_pte_ctor(NULL, ptdesc)) BUG(); - return ptdesc_to_virt(ptdesc); + return ptdesc_address(ptdesc); } static pte_t * __init arm_pte_alloc(pmd_t *pmd, unsigned long addr, diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index d2f6f1f6d2fc..f56ee9aeac83 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -21,7 +21,7 @@ unsigned long *crst_table_alloc(struct mm_struct *mm) if (!ptdesc) return NULL; - table = ptdesc_to_virt(ptdesc); + table = ptdesc_address(ptdesc); __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); return table; } @@ -119,7 +119,7 @@ struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm) ptdesc = pagetable_alloc(GFP_KERNEL, 0); if (ptdesc) { - table = (u64 *)ptdesc_to_virt(ptdesc); + table = (u64 *)ptdesc_address(ptdesc); __arch_set_page_dat(table, 1); memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); @@ -146,7 +146,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) pagetable_free(ptdesc); return NULL; } - table = ptdesc_to_virt(ptdesc); + table = ptdesc_address(ptdesc); __arch_set_page_dat(table, 1); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, 
PTRS_PER_PTE); diff --git a/include/linux/mm.h b/include/linux/mm.h index 25f56e209ec8..da6e0abad2cb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2944,11 +2944,12 @@ static inline struct ptdesc *virt_to_ptdesc(const void *x) return page_ptdesc(virt_to_page(x)); } -static inline void *ptdesc_to_virt(const struct ptdesc *pt) -{ - return page_to_virt(ptdesc_page(pt)); -} - +/** + * ptdesc_address - Virtual address of page table. + * @pt: Page table descriptor. + * + * Return: The first byte of the page table described by @pt. + */ static inline void *ptdesc_address(const struct ptdesc *pt) { return folio_address(ptdesc_folio(pt)); -- cgit v1.2.3 From e7a5f249e6db3b41a4618763e0f840639f3578f4 Mon Sep 17 00:00:00 2001 From: Chanwon Park Date: Mon, 8 Sep 2025 19:04:10 +0900 Subject: mm: re-enable kswapd when memory pressure subsides or demotion is toggled If kswapd fails to reclaim pages from a node MAX_RECLAIM_RETRIES times in a row, kswapd on that node gets disabled. That is, the system won't wake up kswapd for that node until page reclamation is observed at least once. That reclamation is mostly done by direct reclaim, which in turn re-enables kswapd. However, on systems with CXL memory nodes, workloads with high anon page usage can disable kswapd indefinitely, without triggering direct reclaim. This can be reproduced with the following steps:

numa node 0 (32GB memory, 48 CPUs)
numa node 2~5 (512GB CXL memory, 128GB each)
(numa node 1 is disabled)
swap space 8GB

1) Set /sys/kernel/mm/demotion_enabled to 0.
2) Set /proc/sys/kernel/numa_balancing to 0.
3) Run a process that allocates and randomly accesses 500GB of anon pages.
4) Let the process exit normally.

During 3), free memory on node 0 falls below the low watermark, and kswapd runs and depletes swap space. Then, kswapd fails consecutively and gets disabled. Allocation afterwards happens on CXL memory, so node 0 never gains more memory pressure to trigger direct reclaim. After 4), kswapd on node 0 remains disabled, and tasks running on that node are unable to swap. If you turn on NUMA_BALANCING_MEMORY_TIERING and demotion now, it won't work properly since kswapd is disabled. To mitigate this problem, reset kswapd_failures to 0 under the following conditions:

a) The ZONE_BELOW_HIGH bit of a zone in a hopeless node that has a fallback memory node gets cleared.
b) demotion_enabled is changed from false to true.

Rationale for a): the ZONE_BELOW_HIGH bit being cleared might be a sign that the node may be reclaimable afterwards. This won't help much if the memory-hungry process keeps running without freeing anything, but at least the node will go back to a reclaimable state when the process exits. Rationale for b): when demotion_enabled is false, kswapd can only reclaim anon pages by swapping them out to swap space. If demotion_enabled is turned on, kswapd can demote anon pages to another node for reclaiming. So, the original failure count for determining reclaimability is no longer valid. Since resets of kswapd_failures may be missed when racing with the ++ operation, the field is changed from int to atomic_t.
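A short sketch of the resulting accounting (names as in the hunks below):

	/* kswapd: record one more 'reclaimed == 0' run */
	atomic_inc(&pgdat->kswapd_failures);

	/* reset paths (direct reclaim success, watermark recovery,
	 * demotion toggle): atomic_set() pairs safely with a concurrent
	 * atomic_inc(), unlike a plain store to an int
	 */
	atomic_set(&pgdat->kswapd_failures, 0);

	/* readers only compare against the threshold */
	if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
		return true;	/* hopeless node, leave it to direct reclaim */
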
[akpm@linux-foundation.org: tweak whitespace] Link: https://lkml.kernel.org/r/aL6qGi69jWXfPc4D@pcw-MS-7D22 Signed-off-by: Chanwon Park Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- mm/memory-tiers.c | 12 ++++++++++++ mm/page_alloc.c | 29 ++++++++++++++++++++++------- mm/show_mem.c | 3 ++- mm/vmscan.c | 14 +++++++------- mm/vmstat.c | 2 +- 6 files changed, 45 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6c4eae96160d..7fb7331c5725 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1440,7 +1440,7 @@ typedef struct pglist_data { int kswapd_order; enum zone_type kswapd_highest_zoneidx; - int kswapd_failures; /* Number of 'reclaimed == 0' runs */ + atomic_t kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 0382b6942b8b..0ea5c13f10a2 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -942,11 +942,23 @@ static ssize_t demotion_enabled_store(struct kobject *kobj, const char *buf, size_t count) { ssize_t ret; + bool before = numa_demotion_enabled; ret = kstrtobool(buf, &numa_demotion_enabled); if (ret) return ret; + /* + * Reset kswapd_failures statistics. They may no longer be + * valid since the policy for kswapd has changed. + */ + if (before == false && numa_demotion_enabled == true) { + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) + atomic_set(&pgdat->kswapd_failures, 0); + } + return count; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df6df302d0c5..6ff9f17d5f4e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2860,14 +2860,29 @@ static void free_frozen_page_commit(struct zone *zone, */ return; } + high = nr_pcp_high(pcp, zone, batch, free_high); - if (pcp->count >= high) { - free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), - pcp, pindex); - if (test_bit(ZONE_BELOW_HIGH, &zone->flags) && - zone_watermark_ok(zone, 0, high_wmark_pages(zone), - ZONE_MOVABLE, 0)) - clear_bit(ZONE_BELOW_HIGH, &zone->flags); + if (pcp->count < high) + return; + + free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), + pcp, pindex); + if (test_bit(ZONE_BELOW_HIGH, &zone->flags) && + zone_watermark_ok(zone, 0, high_wmark_pages(zone), + ZONE_MOVABLE, 0)) { + struct pglist_data *pgdat = zone->zone_pgdat; + clear_bit(ZONE_BELOW_HIGH, &zone->flags); + + /* + * Assume that memory pressure on this node is gone and may be + * in a reclaimable state. If a memory fallback node exists, + * direct reclaim may not have been triggered, causing a + * 'hopeless node' to stay in that state for a while. Let + * kswapd work again by resetting kswapd_failures. 
+ */ + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES && + next_memory_node(pgdat->node_id) < MAX_NUMNODES) + atomic_set(&pgdat->kswapd_failures, 0); } } diff --git a/mm/show_mem.c b/mm/show_mem.c index 90a9a37116e7..3a4b5207635d 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -278,7 +278,8 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z #endif K(node_page_state(pgdat, NR_PAGETABLE)), K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), - str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES), + str_yes_no(atomic_read(&pgdat->kswapd_failures) >= + MAX_RECLAIM_RETRIES), K(node_page_state(pgdat, NR_BALLOON_PAGES))); } diff --git a/mm/vmscan.c b/mm/vmscan.c index e170c12e2065..b2fc8b626d3d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -518,7 +518,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) * If kswapd is disabled, reschedule if necessary but do not * throttle as the system is likely near OOM. */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; /* @@ -5101,7 +5101,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * blk_finish_plug(&plug); done: if (sc->nr_reclaimed > reclaimed) - pgdat->kswapd_failures = 0; + atomic_set(&pgdat->kswapd_failures, 0); } /****************************************************************************** @@ -6180,7 +6180,7 @@ again: * successful direct reclaim run will revive a dormant kswapd. */ if (reclaimable) - pgdat->kswapd_failures = 0; + atomic_set(&pgdat->kswapd_failures, 0); else if (sc->cache_trim_mode) sc->cache_trim_mode_failed = 1; } @@ -6492,7 +6492,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) int i; bool wmark_ok; - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) { @@ -6902,7 +6902,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, wake_up_all(&pgdat->pfmemalloc_wait); /* Hopeless node, leave it to direct reclaim */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; if (pgdat_balanced(pgdat, order, highest_zoneidx)) { @@ -7170,7 +7170,7 @@ restart: } if (!sc.nr_reclaimed) - pgdat->kswapd_failures++; + atomic_inc(&pgdat->kswapd_failures); out: clear_reclaim_active(pgdat, highest_zoneidx); @@ -7429,7 +7429,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, return; /* Hopeless node, leave it to direct reclaim if possible */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES || (pgdat_balanced(pgdat, order, highest_zoneidx) && !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* diff --git a/mm/vmstat.c b/mm/vmstat.c index e522decf6a72..bb09c032eecf 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1848,7 +1848,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n node_unreclaimable: %u" "\n start_pfn: %lu", - pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, + atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES, zone->zone_start_pfn); seq_putc(m, '\n'); } -- cgit v1.2.3 From 032c31127f27acb1b8152b512830ecef04ed2ebc Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Tue, 9 Sep 2025 13:13:57 -0700 Subject: mm: vm_event_item: explicit #include for THREAD_SIZE This header uses THREAD_SIZE, which is 
provided by the thread_info.h header but is not included in this header. Depending on the #include ordering in other files, this can produce preprocessor errors. Link: https://lkml.kernel.org/r/20250909201419.827638-1-briannorris@chromium.org Signed-off-by: Brian Norris Reviewed-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/vm_event_item.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 9e15a088ba38..92f80b4d69a6 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -2,6 +2,8 @@ #ifndef VM_EVENT_ITEM_H_INCLUDED #define VM_EVENT_ITEM_H_INCLUDED +#include <linux/thread_info.h> + #ifdef CONFIG_ZONE_DMA #define DMA_ZONE(xx) xx##_DMA, #else -- cgit v1.2.3 From fa17bcd5f65ed702df001579cca8c885fa6bf3e7 Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Tue, 26 Aug 2025 11:37:21 -0400 Subject: mm: make folio page count functions return unsigned As raised by Andrew [1], a folio/compound page never spans a negative number of pages. Consequently, let's use "unsigned long" instead of "long" consistently for folio_nr_pages(), folio_large_nr_pages() and compound_nr(). Using "unsigned long" as return value is fine, because even "(long)-folio_nr_pages()" will keep on working as expected. Using "unsigned int" instead would actually break these use cases. This patch takes the first step, changing these to return unsigned long (and making drm_gem_get_pages() use the new types instead of replacing min()). In the future, we might want to make more callers of these functions consistently use "unsigned long". Link: https://lore.kernel.org/linux-mm/20250503182858.5a02729fcffd6d4723afcfc2@linux-foundation.org/ Link: https://lkml.kernel.org/r/20250826153721.GA23292@cathedrallabs.org Link: https://lore.kernel.org/linux-mm/20250503182858.5a02729fcffd6d4723afcfc2@linux-foundation.org/ [1] Signed-off-by: Aristeu Rozanski Suggested-by: Andrew Morton Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Simona Vetter Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- drivers/gpu/drm/drm_gem.c | 4 ++-- include/linux/mm.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index 6a44351e58b7..5dbcf91ff73c 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -621,7 +621,7 @@ struct page **drm_gem_get_pages(struct drm_gem_object *obj) struct page **pages; struct folio *folio; struct folio_batch fbatch; - long i, j, npages; + unsigned long i, j, npages; if (WARN_ON(!obj->filp)) return ERR_PTR(-EINVAL); @@ -645,7 +645,7 @@ struct page **drm_gem_get_pages(struct drm_gem_object *obj) i = 0; while (i < npages) { - long nr; + unsigned long nr; folio = shmem_read_folio_gfp(mapping, i, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) diff --git a/include/linux/mm.h b/include/linux/mm.h index da6e0abad2cb..8f5b4df9b166 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1018,12 +1018,12 @@ static inline unsigned int folio_large_order(const struct folio *folio) } #ifdef NR_PAGES_IN_LARGE_FOLIO -static inline long folio_large_nr_pages(const struct folio *folio) +static inline unsigned long folio_large_nr_pages(const struct folio *folio) { return folio->_nr_pages; } #else
-static inline long folio_large_nr_pages(const struct folio *folio) +static inline unsigned long folio_large_nr_pages(const struct folio *folio) { return 1L << folio_large_order(folio); } @@ -2062,7 +2062,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone, * * Return: A positive power of two. */ -static inline long folio_nr_pages(const struct folio *folio) +static inline unsigned long folio_nr_pages(const struct folio *folio) { if (!folio_test_large(folio)) return 1; @@ -2097,7 +2097,7 @@ static inline long folio_nr_pages(const struct folio *folio) * page. compound_nr() can be called on a tail page, and is defined to * return 1 in that case. */ -static inline long compound_nr(const struct page *page) +static inline unsigned long compound_nr(const struct page *page) { const struct folio *folio = (struct folio *)page; -- cgit v1.2.3 From 3a37469e5ac004905e4125bf60b43cc5216e83dc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Sep 2025 15:29:17 +0100 Subject: mm: constify compound_order() and page_size() Patch series "Small cleanups". These small cleanups can be applied now to reduce conflicts during the next merge window. They're all from various efforts to split struct page from other memdescs. Thanks to Vlastimil for the suggestion. This patch (of 3): These functions do not modify their arguments. Telling the compiler this may improve code generation, and allows us to pass const arguments from other functions. Link: https://lkml.kernel.org/r/20250910142923.2465470-1-willy@infradead.org Link: https://lkml.kernel.org/r/20250910142923.2465470-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f5b4df9b166..fcb1e72eea40 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1036,9 +1036,9 @@ static inline unsigned long folio_large_nr_pages(const struct folio *folio) * set before the order is initialised, or this may be a tail page. * See compaction.c for some good examples. */ -static inline unsigned int compound_order(struct page *page) +static inline unsigned int compound_order(const struct page *page) { - struct folio *folio = (struct folio *)page; + const struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags.f)) return 0; @@ -1256,7 +1256,7 @@ int folio_mc_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); /* Returns the number of bytes in this potentially compound page. */ -static inline unsigned long page_size(struct page *page) +static inline unsigned long page_size(const struct page *page) { return PAGE_SIZE << compound_order(page); } -- cgit v1.2.3 From 9d003dec972563efb8ce14c9962af3652d0e201d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Sep 2025 15:29:19 +0100 Subject: mm: remove page->order We already use page->private for storing the order of a page while it's in the buddy allocator system; extend that to also storing the order while it's in the pcp_llist. 
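For illustration, the reuse pattern this establishes (a sketch; set_page_private() and page_private() are the stock accessors for this field):

	/* enqueue on the trylock llist: stash the order in the slot
	 * that PageBuddy already uses for it
	 */
	set_page_private(page, order);
	llist_add(&page->pcp_llist, &zone->trylock_free_pages);

	/* dequeue: the order comes straight back out of ->private */
	unsigned int p_order = page_private(p);
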
Link: https://lkml.kernel.org/r/20250910142923.2465470-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Alexei Starovoitov Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 8 +++----- mm/page_alloc.c | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f048dc80646e..6920c816f6c6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -97,10 +97,7 @@ struct page { /* Or, free page */ struct list_head buddy_list; struct list_head pcp_list; - struct { - struct llist_node pcp_llist; - unsigned int order; - }; + struct llist_node pcp_llist; }; struct address_space *mapping; union { @@ -111,7 +108,8 @@ struct page { * @private: Mapping-private opaque data. * Usually used for buffer_heads if PagePrivate. * Used for swp_entry_t if swapcache flag set. - * Indicates order in the buddy system if PageBuddy. + * Indicates order in the buddy system if PageBuddy + * or on pcp_llist. */ unsigned long private; }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cf38d499e045..2bfab96c207f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1520,7 +1520,7 @@ static void add_page_to_zone_llist(struct zone *zone, struct page *page, unsigned int order) { /* Remember the order */ - page->order = order; + page->private = order; /* Add the page to the free list */ llist_add(&page->pcp_llist, &zone->trylock_free_pages); } @@ -1549,7 +1549,7 @@ static void free_one_page(struct zone *zone, struct page *page, llnode = llist_del_all(llhead); llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) { - unsigned int p_order = p->order; + unsigned int p_order = p->private; split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags); __count_vm_events(PGFREE, 1 << p_order); -- cgit v1.2.3 From d02ac836e4d6bdfd7d44927d01a4cd048ad4aba8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 13 Sep 2025 17:03:39 -0700 Subject: include/linux/pgtable.h: convert arch_enter_lazy_mmu_mode() and friends to static inlines For all the usual reasons, plus a new one. Calling (void)arch_enter_lazy_mmu_mode(); deservedly blows up. Cc: Balbir Singh Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 94249e671a7e..32e8457ad535 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -232,9 +232,9 @@ static inline int pmd_dirty(pmd_t pmd) * and the mode cannot be used in interrupt context. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE -#define arch_enter_lazy_mmu_mode() do {} while (0) -#define arch_leave_lazy_mmu_mode() do {} while (0) -#define arch_flush_lazy_mmu_mode() do {} while (0) +static inline void arch_enter_lazy_mmu_mode(void) {} +static inline void arch_leave_lazy_mmu_mode(void) {} +static inline void arch_flush_lazy_mmu_mode(void) {} #endif #ifndef pte_batch_hint -- cgit v1.2.3 From 59d4d36158ba3cdbce141d8e9261eea154d4c441 Mon Sep 17 00:00:00 2001 From: zhongjinji Date: Tue, 16 Sep 2025 00:29:45 +0800 Subject: mm/oom_kill: thaw the entire OOM victim process Patch series "Improvements to Victim Process Thawing and OOM Reaper Traversal Order", v10. This patch series focuses on optimizing victim process thawing and refining the traversal order of the OOM reaper. 
Since __thaw_task() is used to thaw a single thread of the victim, thawing only one thread cannot guarantee the exit of the OOM victim when it is frozen. Patch 1 thaws the entire process of the OOM victim to ensure that OOM victims are able to terminate themselves. Even if the oom_reaper is delayed, patch 2 is still beneficial for reaping processes with a large address space footprint, and it also greatly improves process_mrelease. This patch (of 10): The OOM killer is a mechanism that selects and kills processes when the system runs out of memory, to reclaim resources and keep the system stable. But the oom victim cannot terminate on its own when it is frozen, even if the OOM victim task is thawed through __thaw_task(). This is because __thaw_task() can only thaw a single OOM victim thread, and cannot thaw the entire OOM victim process. In addition, freezing_slow_path() determines whether a task is an OOM victim by checking the task's TIF_MEMDIE flag. When a task is identified as an OOM victim, the freezer bypasses both PM freezing and cgroup freezing states to thaw it. Historically, TIF_MEMDIE was a "this is the oom victim & it has access to memory reserves" flag. It had thread vs. process problems, and tsk_is_oom_victim was introduced later to get rid of them and other issues, as well as to guarantee that we can identify the oom victim's mm reliably, for the oom_reaper among other users. Therefore, thaw_process() is introduced to unfreeze all threads within the OOM victim process, ensuring that every thread is properly thawed. The freezer now uses tsk_is_oom_victim() to determine OOM victim status, allowing all victim threads to be unfrozen as necessary. With this change, the entire OOM victim process will be thawed when an OOM event occurs, ensuring that the victim can terminate on its own. Link: https://lkml.kernel.org/r/20250915162946.5515-1-zhongjinji@honor.com Link: https://lkml.kernel.org/r/20250915162946.5515-2-zhongjinji@honor.com Signed-off-by: zhongjinji Reviewed-by: Suren Baghdasaryan Acked-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: Liam R.
Howlett Cc: David Rientjes Cc: Len Brown Cc: Lorenzo Stoakes Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- include/linux/freezer.h | 2 ++ kernel/freezer.c | 20 +++++++++++++++++++- mm/oom_kill.c | 10 +++++----- 3 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/freezer.h b/include/linux/freezer.h index b303472255be..32884c9721e5 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -47,6 +47,7 @@ extern int freeze_processes(void); extern int freeze_kernel_threads(void); extern void thaw_processes(void); extern void thaw_kernel_threads(void); +extern void thaw_process(struct task_struct *p); static inline bool try_to_freeze(void) { @@ -80,6 +81,7 @@ static inline int freeze_processes(void) { return -ENOSYS; } static inline int freeze_kernel_threads(void) { return -ENOSYS; } static inline void thaw_processes(void) {} static inline void thaw_kernel_threads(void) {} +static inline void thaw_process(struct task_struct *p) {} static inline bool try_to_freeze(void) { return false; } diff --git a/kernel/freezer.c b/kernel/freezer.c index 6a96149aede9..ddc11a8bd2ea 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -10,6 +10,7 @@ #include #include #include +#include #include /* total number of freezing conditions in effect */ @@ -40,7 +41,7 @@ bool freezing_slow_path(struct task_struct *p) if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; - if (test_tsk_thread_flag(p, TIF_MEMDIE)) + if (tsk_is_oom_victim(p)) return false; if (pm_nosig_freezing || cgroup_freezing(p)) @@ -206,6 +207,23 @@ void __thaw_task(struct task_struct *p) wake_up_state(p, TASK_FROZEN); } +/* + * thaw_process - Thaw a frozen process + * @p: the process to be thawed + * + * Iterate over all threads of @p and call __thaw_task() on each. + */ +void thaw_process(struct task_struct *p) +{ + struct task_struct *t; + + rcu_read_lock(); + for_each_thread(p, t) { + __thaw_task(t); + } + rcu_read_unlock(); +} + /** * set_freezable - make %current freezable * diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 58bd4cf71d52..22caef65f1d0 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -772,12 +772,12 @@ static void mark_oom_victim(struct task_struct *tsk) mmgrab(tsk->signal->oom_mm); /* - * Make sure that the task is woken up from uninterruptible sleep - * if it is frozen because OOM killer wouldn't be able to free - * any memory and livelock. freezing_slow_path will tell the freezer - * that TIF_MEMDIE tasks should be ignored. + * Make sure that the process is woken up from uninterruptible sleep + * if it is frozen because OOM killer wouldn't be able to free any + * memory and livelock. The freezer will thaw the tasks that are OOM + * victims regardless of the PM freezing and cgroup freezing states. */ - __thaw_task(tsk); + thaw_process(tsk); atomic_inc(&oom_victims); cred = get_task_cred(tsk); trace_mark_victim(tsk, cred->uid.val); -- cgit v1.2.3 From b9e2f58ffb84afcbba7e66f96ca14f98e0e88f26 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 15 Sep 2025 16:02:24 -0700 Subject: alloc_tag: mark inaccurate allocation counters in /proc/allocinfo output While rare, memory allocation profiling can contain inaccurate counters if slab object extension vector allocation fails. That allocation might succeed later but prior to that, slab allocations that would have used that object extension vector will not be accounted for. To indicate incorrect counters, "accurate:no" marker is appended to the call site line in the /proc/allocinfo output. 
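For consumers of the file, a hedged sketch of checking a v2 line (hypothetical userspace helper, not part of this patch):

	#include <stdbool.h>
	#include <string.h>

	/* v2 lines look like: "<bytes> <calls> <tag info> [<key>:<value>]..." */
	static bool allocinfo_line_is_accurate(const char *line)
	{
		/* "accurate:no" is appended only when counters may be off */
		return strstr(line, " accurate:no") == NULL;
	}
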
Bump up /proc/allocinfo version to reflect the change in the file format and update documentation. Example output with invalid counters:

allocinfo - version: 2.0
        0        0 arch/x86/kernel/kdebugfs.c:105 func:create_setup_data_nodes
        0        0 arch/x86/kernel/alternative.c:2090 func:alternatives_smp_module_add
        0        0 arch/x86/kernel/alternative.c:127 func:__its_alloc accurate:no
        0        0 arch/x86/kernel/fpu/regset.c:160 func:xstateregs_set
        0        0 arch/x86/kernel/fpu/xstate.c:1590 func:fpstate_realloc
        0        0 arch/x86/kernel/cpu/aperfmperf.c:379 func:arch_enable_hybrid_capacity_scale
        0        0 arch/x86/kernel/cpu/amd_cache_disable.c:258 func:init_amd_l3_attrs
    49152       48 arch/x86/kernel/cpu/mce/core.c:2709 func:mce_device_create accurate:no
    32768        1 arch/x86/kernel/cpu/mce/genpool.c:132 func:mce_gen_pool_create
        0        0 arch/x86/kernel/cpu/mce/amd.c:1341 func:mce_threshold_create_device

[surenb@google.com: document new "accurate:no" marker] Fixes: 39d117e04d15 ("alloc_tag: mark inaccurate allocation counters in /proc/allocinfo output") [akpm@linux-foundation.org: simplification per Usama, reflow text] [akpm@linux-foundation.org: add newline to prevent docs warning, per Randy] Link: https://lkml.kernel.org/r/20250915230224.4115531-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Usama Arif Acked-by: Johannes Weiner Cc: David Rientjes Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Roman Gushchin Cc: Sourav Panda Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 13 +++++++++++++ include/linux/alloc_tag.h | 12 ++++++++++++ include/linux/codetag.h | 5 ++++- lib/alloc_tag.c | 4 +++- mm/slub.c | 2 ++ 5 files changed, 34 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 915a3e44bc12..6ffd04c736db 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1009,6 +1009,19 @@ number, module (if originates from a loadable module) and the function calling the allocation. The number of bytes allocated and number of calls at each location are reported. The first line indicates the version of the file, the second line is the header listing fields in the file. +If file version is 2.0 or higher then each line may contain additional +<key>:<value> pairs representing extra information about the call site. +For example if the counters are not accurate, the line will be appended with +"accurate:no" pair. + +Supported markers in v2: +accurate:no + + Absolute values of the counters in this line are not accurate + because of the failure to allocate memory to track some of the + allocations made at this location. Deltas in these counters are + accurate, therefore counters can be used to track allocation size + and count changes. Example output.
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 9ef2633e2c08..d40ac39bfbe8 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -221,6 +221,16 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) ref->ct = NULL; } +static inline void alloc_tag_set_inaccurate(struct alloc_tag *tag) +{ + tag->ct.flags |= CODETAG_FLAG_INACCURATE; +} + +static inline bool alloc_tag_is_inaccurate(struct alloc_tag *tag) +{ + return !!(tag->ct.flags & CODETAG_FLAG_INACCURATE); +} + #define alloc_tag_record(p) ((p) = current->alloc_tag) #else /* CONFIG_MEM_ALLOC_PROFILING */ @@ -230,6 +240,8 @@ static inline bool mem_alloc_profiling_enabled(void) { return false; } static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) {} static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} +static inline void alloc_tag_set_inaccurate(struct alloc_tag *tag) {} +static inline bool alloc_tag_is_inaccurate(struct alloc_tag *tag) { return false; } #define alloc_tag_record(p) do {} while (0) #endif /* CONFIG_MEM_ALLOC_PROFILING */ diff --git a/include/linux/codetag.h b/include/linux/codetag.h index 457ed8fd3214..8ea2a5f7c98a 100644 --- a/include/linux/codetag.h +++ b/include/linux/codetag.h @@ -16,13 +16,16 @@ struct module; #define CODETAG_SECTION_START_PREFIX "__start_" #define CODETAG_SECTION_STOP_PREFIX "__stop_" +/* codetag flags */ +#define CODETAG_FLAG_INACCURATE (1 << 0) + /* * An instance of this structure is created in a special ELF section at every * code location being tagged. At runtime, the special section is treated as * an array of these. */ struct codetag { - unsigned int flags; /* used in later patches */ + unsigned int flags; unsigned int lineno; const char *modname; const char *function; diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index f79217427d81..3ef702e6b69a 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -80,7 +80,7 @@ static void allocinfo_stop(struct seq_file *m, void *arg) static void print_allocinfo_header(struct seq_buf *buf) { /* Output format version, so we can change it. */ - seq_buf_printf(buf, "allocinfo - version: 1.0\n"); + seq_buf_printf(buf, "allocinfo - version: 2.0\n"); seq_buf_printf(buf, "# <size> <calls> <tag info>\n"); } @@ -92,6 +92,8 @@ static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct) seq_buf_printf(out, "%12lli %8llu ", bytes, counter.calls); codetag_to_text(out, ct); + if (unlikely(alloc_tag_is_inaccurate(tag))) + seq_buf_printf(out, " accurate:no"); seq_buf_putc(out, ' '); seq_buf_putc(out, '\n'); } diff --git a/mm/slub.c b/mm/slub.c index af343ca570b5..9c04f29ee8de 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2143,6 +2143,8 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) */ if (likely(obj_exts)) alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); + else + alloc_tag_set_inaccurate(current->alloc_tag); } static inline void -- cgit v1.2.3 From ab152db3cae520154d572cff32e63de441672454 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 15 Sep 2025 20:35:05 -0700 Subject: mm/damon/core: implement damon_initialized() function Patch series "mm/damon: define and use DAMON initialization check function". DAMON is initialized at subsystem initialization time, by damon_init(). If DAMON API functions are called before the initialization, the system could crash. Actually such issues happened and were fixed [1] in the past.
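A usage sketch of the check this series adds (damon_initialized() is introduced in the hunks below; the calling module is hypothetical):

	static int __init my_damon_api_user_init(void)
	{
		/* DAMON core init can fail, e.g. if KMEM_CACHE() fails */
		if (!damon_initialized())
			return -ENOMEM;

		/* safe to call DAMON API functions from here on */
		return 0;
	}
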
For the fix, DAMON API callers have been updated to check whether DAMON is initialized, using their own hacks. The hacks are unnecessarily duplicated across DAMON API callers and would therefore be difficult to reliably maintain in the long term. Make it reliable and easy to maintain. For this, implement a new DAMON core layer API function that returns whether DAMON is successfully initialized. If it returns true, it means DAMON API functions are safe to use. After the introduction of the new API, update DAMON API callers to use the new function instead of their own hacks. This patch (of 7): If DAMON is used before it has been successfully initialized, the caller could crash. The DAMON core layer does not provide a reliable way to see whether it is successfully initialized and therefore ready to be used, though. As a result, DAMON API callers are implementing their own hacks to check it. The hacks simply assume DAMON should be ready by module init time. This is not reliable, as DAMON initialization can indeed fail if KMEM_CACHE() fails, and it is difficult to maintain, as those hacks are duplicates. Implement a core layer API function for better reliability and maintainability; followup commits will replace the hacks with it. Link: https://lkml.kernel.org/r/20250916033511.116366-2-sj@kernel.org Link: https://lore.kernel.org/20250909022238.2989-1-sj@kernel.org [1] Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + mm/damon/core.c | 10 ++++++++++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index aa7381be388c..cae8c613c5fc 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -938,6 +938,7 @@ static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs } +bool damon_initialized(void); int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); bool damon_is_running(struct damon_ctx *ctx); diff --git a/mm/damon/core.c b/mm/damon/core.c index 775121ae7a9b..93848b4c6944 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2863,6 +2863,16 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed, r->nr_accesses++; } +/** + * damon_initialized() - Return if DAMON is ready to be used. + * + * Return: true if DAMON is ready to be used, false otherwise. + */ +bool damon_initialized(void) +{ + return damon_region_cache != NULL; +} + static int __init damon_init(void) { damon_region_cache = KMEM_CACHE(damon_region, 0); -- cgit v1.2.3 From a05e4e935a6689542d86162b33a484cc704ce39a Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Fri, 29 Aug 2025 17:09:44 +0200 Subject: virtio_config: clarify output parameters This was ambiguous enough for a broken patch (206cc44588f7 ("virtio: reject shm region if length is zero")) to make it into the kernel, so make it clearer. Link: https://lore.kernel.org/r/20250816071600-mutt-send-email-mst@kernel.org/ Signed-off-by: Alyssa Ross Message-Id: <20250829150944.233505-1-hi@alyssa.is> Signed-off-by: Michael S.
Tsirkin --- include/linux/virtio_config.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 8bf156dde554..7427b79d6f3d 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -193,14 +193,15 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev, } static inline void virtio_get_features(struct virtio_device *vdev, - u64 *features) + u64 *features_out) { if (vdev->config->get_extended_features) { - vdev->config->get_extended_features(vdev, features); + vdev->config->get_extended_features(vdev, features_out); return; } - virtio_features_from_u64(features, vdev->config->get_features(vdev)); + virtio_features_from_u64(features_out, + vdev->config->get_features(vdev)); } /** @@ -326,11 +327,11 @@ int virtqueue_set_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) static inline bool virtio_get_shm_region(struct virtio_device *vdev, - struct virtio_shm_region *region, u8 id) + struct virtio_shm_region *region_out, u8 id) { if (!vdev->config->get_shm_region) return false; - return vdev->config->get_shm_region(vdev, region, id); + return vdev->config->get_shm_region(vdev, region_out, id); } static inline bool virtio_is_little_endian(struct virtio_device *vdev) -- cgit v1.2.3 From 2ee3a75e42081db3d951c0893f5d654f16d1c0e8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 18 Jul 2025 11:26:15 +1000 Subject: nfsd: discard nfsd_file_get_local() This interface was deprecated by commit e6f7e1487ab5 ("nfs_localio: simplify interface to nfsd for getting nfsd_file") and is now unused. So let's remove it. Signed-off-by: NeilBrown Reviewed-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 21 --------------------- fs/nfsd/filecache.h | 1 - fs/nfsd/localio.c | 1 - include/linux/nfslocalio.h | 1 - 4 files changed, 24 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 732abf6b92a5..75bc48031c07 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -391,27 +391,6 @@ nfsd_file_put_local(struct nfsd_file __rcu **pnf) return net; } -/** - * nfsd_file_get_local - get nfsd_file reference and reference to net - * @nf: nfsd_file of which to put the reference - * - * Get reference to both the nfsd_file and nf->nf_net. - */ -struct nfsd_file * -nfsd_file_get_local(struct nfsd_file *nf) -{ - struct net *net = nf->nf_net; - - if (nfsd_net_try_get(net)) { - nf = nfsd_file_get(nf); - if (!nf) - nfsd_net_put(net); - } else { - nf = NULL; - } - return nf; -} - /** * nfsd_file_file - get the backing file of an nfsd_file * @nf: nfsd_file of which to access the backing file. 
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 722b26c71e45..24ddf60e8434 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -63,7 +63,6 @@ int nfsd_file_cache_start_net(struct net *net); void nfsd_file_cache_shutdown_net(struct net *net); void nfsd_file_put(struct nfsd_file *nf); struct net *nfsd_file_put_local(struct nfsd_file __rcu **nf); -struct nfsd_file *nfsd_file_get_local(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); struct file *nfsd_file_file(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index cb237f1b902a..269fa9391dc4 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -122,7 +122,6 @@ static const struct nfsd_localio_operations nfsd_localio_ops = { .nfsd_net_put = nfsd_net_put, .nfsd_open_local_fh = nfsd_open_local_fh, .nfsd_file_put_local = nfsd_file_put_local, - .nfsd_file_get_local = nfsd_file_get_local, .nfsd_file_file = nfsd_file_file, }; diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 5c7c92659e73..59ea90bd136b 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -63,7 +63,6 @@ struct nfsd_localio_operations { struct nfsd_file __rcu **pnf, const fmode_t); struct net *(*nfsd_file_put_local)(struct nfsd_file __rcu **); - struct nfsd_file *(*nfsd_file_get_local)(struct nfsd_file *); struct file *(*nfsd_file_file)(struct nfsd_file *); } ____cacheline_aligned; -- cgit v1.2.3 From c97b737ef8f10f28424822c139e3b22b9e9bcc2b Mon Sep 17 00:00:00 2001 From: Sergey Bashirov Date: Fri, 18 Jul 2025 11:09:56 +0300 Subject: sunrpc: Change ret code of xdr_stream_decode_opaque_fixed Since the opaque is fixed in size, the caller already knows how many bytes were decoded, on success. Thus, xdr_stream_decode_opaque_fixed() doesn't need to return that value. And, xdr_stream_decode_u32 and _u64 both return zero on success. This patch simplifies the caller's error checking to avoid potential integer promotion issues. 
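A sketch of the caller-side simplification (hypothetical decode path; the buffer is illustrative):

	__be32 verf[2];

	/* before: success returned the length, so callers had to
	 * compare against 0 and could trip over integer promotion
	 */
	if (xdr_stream_decode_opaque_fixed(xdr, verf, sizeof(verf)) < 0)
		return -EBADMSG;

	/* after: like the u32/u64 helpers, zero means success */
	if (xdr_stream_decode_opaque_fixed(xdr, verf, sizeof(verf)))
		return -EBADMSG;
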
Suggested-by: Dan Carpenter Signed-off-by: Sergey Bashirov Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdr.h | 4 ++-- .../sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 8a9ec617cf66..8d354015d762 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -721,7 +721,7 @@ xdr_stream_decode_u64(struct xdr_stream *xdr, __u64 *ptr) * @len: size of buffer pointed to by @ptr * * Return values: - * On success, returns size of object stored in @ptr + * %0 on success * %-EBADMSG on XDR buffer overflow */ static inline ssize_t @@ -732,7 +732,7 @@ xdr_stream_decode_opaque_fixed(struct xdr_stream *xdr, void *ptr, size_t len) if (unlikely(!p)) return -EBADMSG; xdr_decode_opaque_fixed(p, ptr, len); - return len; + return 0; } /** diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 index 8b4ff08c49e5..bdc7bd24ffb1 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 @@ -13,5 +13,5 @@ xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ classifier }}{{ name }} *ptr {% if annotate %} /* (fixed-length opaque) */ {% endif %} - return xdr_stream_decode_opaque_fixed(xdr, ptr, {{ size }}) >= 0; + return xdr_stream_decode_opaque_fixed(xdr, ptr, {{ size }}) == 0; }; -- cgit v1.2.3 From afc5b36e29b95fbd31a60b9630d148857e5e513d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 30 Jul 2025 09:24:32 -0400 Subject: vfs: add ATTR_CTIME_SET flag When ATTR_ATIME_SET and ATTR_MTIME_SET are set in the ia_valid mask, the notify_change() logic takes that to mean that the request should set those values explicitly, and not override them with "now". With the advent of delegated timestamps, similar functionality is needed for the ctime. Add a ATTR_CTIME_SET flag, and use that to indicate that the ctime should be accepted as-is. Also, clean up the if statements to eliminate the extra negatives. In setattr_copy() and setattr_copy_mgtime() use inode_set_ctime_deleg() when ATTR_CTIME_SET is set, instead of basing the decision on ATTR_DELEG. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/attr.c | 44 +++++++++++++++++++------------------------- include/linux/fs.h | 1 + 2 files changed, 20 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/fs/attr.c b/fs/attr.c index 5425c1dbbff9..795f231d00e8 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -286,20 +286,12 @@ static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr) unsigned int ia_valid = attr->ia_valid; struct timespec64 now; - if (ia_valid & ATTR_CTIME) { - /* - * In the case of an update for a write delegation, we must respect - * the value in ia_ctime and not use the current time. - */ - if (ia_valid & ATTR_DELEG) - now = inode_set_ctime_deleg(inode, attr->ia_ctime); - else - now = inode_set_ctime_current(inode); - } else { - /* If ATTR_CTIME isn't set, then ATTR_MTIME shouldn't be either. 
*/ - WARN_ON_ONCE(ia_valid & ATTR_MTIME); + if (ia_valid & ATTR_CTIME_SET) + now = inode_set_ctime_deleg(inode, attr->ia_ctime); + else if (ia_valid & ATTR_CTIME) + now = inode_set_ctime_current(inode); + else now = current_time(inode); - } if (ia_valid & ATTR_ATIME_SET) inode_set_atime_to_ts(inode, attr->ia_atime); @@ -359,12 +351,11 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, inode_set_atime_to_ts(inode, attr->ia_atime); if (ia_valid & ATTR_MTIME) inode_set_mtime_to_ts(inode, attr->ia_mtime); - if (ia_valid & ATTR_CTIME) { - if (ia_valid & ATTR_DELEG) - inode_set_ctime_deleg(inode, attr->ia_ctime); - else - inode_set_ctime_to_ts(inode, attr->ia_ctime); - } + + if (ia_valid & ATTR_CTIME_SET) + inode_set_ctime_deleg(inode, attr->ia_ctime); + else if (ia_valid & ATTR_CTIME) + inode_set_ctime_to_ts(inode, attr->ia_ctime); } EXPORT_SYMBOL(setattr_copy); @@ -463,15 +454,18 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, now = current_time(inode); - attr->ia_ctime = now; - if (!(ia_valid & ATTR_ATIME_SET)) - attr->ia_atime = now; - else + if (ia_valid & ATTR_ATIME_SET) attr->ia_atime = timestamp_truncate(attr->ia_atime, inode); - if (!(ia_valid & ATTR_MTIME_SET)) - attr->ia_mtime = now; else + attr->ia_atime = now; + if (ia_valid & ATTR_CTIME_SET) + attr->ia_ctime = timestamp_truncate(attr->ia_ctime, inode); + else + attr->ia_ctime = now; + if (ia_valid & ATTR_MTIME_SET) attr->ia_mtime = timestamp_truncate(attr->ia_mtime, inode); + else + attr->ia_mtime = now; if (ia_valid & ATTR_KILL_PRIV) { error = security_inode_need_killpriv(dentry); diff --git a/include/linux/fs.h b/include/linux/fs.h index 601d036a6c78..74f2bfc51926 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -238,6 +238,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define ATTR_ATIME_SET (1 << 7) #define ATTR_MTIME_SET (1 << 8) #define ATTR_FORCE (1 << 9) /* Not a change, but a change it */ +#define ATTR_CTIME_SET (1 << 10) #define ATTR_KILL_SUID (1 << 11) #define ATTR_KILL_SGID (1 << 12) #define ATTR_FILE (1 << 13) -- cgit v1.2.3 From 898374fdd7f06fa4c4a66e8be3135efeae6128d5 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 19 Aug 2025 14:04:02 -0400 Subject: nfsd: unregister with rpcbind when deleting a transport When a listener is added, a part of creation of transport also registers program/port with rpcbind. However, when the listener is removed, while transport goes away, rpcbind still has the entry for that port/type. When deleting the transport, unregister with rpcbind when appropriate. ---v2 created a new xpt_flag XPT_RPCB_UNREG to mark TCP and UDP transport and at xprt destroy send rpcbind unregister if flag set. Suggested-by: Chuck Lever Fixes: d093c9089260 ("nfsd: fix management of listener transports") Cc: stable@vger.kernel.org Signed-off-by: Olga Kornievskaia Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_xprt.h | 3 +++ net/sunrpc/svc_xprt.c | 13 +++++++++++++ net/sunrpc/svcsock.c | 2 ++ 3 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 369a89aea186..2b886f7eb295 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -104,6 +104,9 @@ enum { * it has access to. It is NOT counted * in ->sv_tmpcnt. 
*/ + XPT_RPCB_UNREG, /* transport that needs unregistering + * with rpcbind (TCP, UDP) on destroy + */ }; /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 8b1837228799..b800d704d807 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1014,6 +1014,19 @@ static void svc_delete_xprt(struct svc_xprt *xprt) struct svc_serv *serv = xprt->xpt_server; struct svc_deferred_req *dr; + /* unregister with rpcbind for when transport type is TCP or UDP. + */ + if (test_bit(XPT_RPCB_UNREG, &xprt->xpt_flags)) { + struct svc_sock *svsk = container_of(xprt, struct svc_sock, + sk_xprt); + struct socket *sock = svsk->sk_sock; + + if (svc_register(serv, xprt->xpt_net, sock->sk->sk_family, + sock->sk->sk_protocol, 0) < 0) + pr_warn("failed to unregister %s with rpcbind\n", + xprt->xpt_class->xcl_name); + } + if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) return; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c0d5a27ba674..7b90abc5cf0e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -836,6 +836,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) /* data might have come in before data_ready set up */ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags); /* make sure we get destination address info */ switch (svsk->sk_sk->sk_family) { @@ -1350,6 +1351,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) if (sk->sk_state == TCP_LISTEN) { strcpy(svsk->sk_xprt.xpt_remotebuf, "listener"); set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags); sk->sk_data_ready = svc_tcp_listen_data_ready; set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); } else { -- cgit v1.2.3 From d73d06dac604043b94a5f18ebb6a69da1b867702 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 Aug 2025 10:27:28 -0400 Subject: SUNRPC: Move the svc_rpcb_cleanup() call sites Clean up: because svc_rpcb_cleanup() and svc_xprt_destroy_all() are always invoked in pairs, we can deduplicate code by moving the svc_rpcb_cleanup() call sites into svc_xprt_destroy_all(). Tested-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 6 ++---- fs/nfs/callback.c | 2 +- fs/nfsd/nfsctl.c | 2 +- fs/nfsd/nfssvc.c | 7 ++----- include/linux/sunrpc/svc_xprt.h | 3 ++- net/sunrpc/svc.c | 1 - net/sunrpc/svc_xprt.c | 7 ++++++- 7 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index e80262a51884..d68afa196535 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -216,8 +216,7 @@ out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); return err; } @@ -255,8 +254,7 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) nlm_shutdown_hosts_net(net); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); } } else { pr_err("%s: no users! 
net=%x\n", diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 511f80878809..c8b837006bb2 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -136,7 +136,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc return; dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum); - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); } static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index bc6b776fc657..63d52edcad72 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1993,7 +1993,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) * remaining listeners and recreate the list. */ if (delete) - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); /* walk list of addrs again, open any that still don't exist */ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 82b0111ac469..7057ddd7a0a8 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -535,16 +535,13 @@ void nfsd_destroy_serv(struct net *net) #endif } - svc_xprt_destroy_all(serv, net); - /* * write_ports can create the server without actually starting - * any threads--if we get shut down before any threads are + * any threads. If we get shut down before any threads are * started, then nfsd_destroy_serv will be run before any of this * other initialization has been done except the rpcb information. */ - svc_rpcb_cleanup(serv, net); - + svc_xprt_destroy_all(serv, net, true); nfsd_shutdown_net(net); svc_destroy(&serv); } diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 2b886f7eb295..da2a2531e110 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -168,7 +168,8 @@ int svc_xprt_create(struct svc_serv *serv, const char *xprt_name, struct net *net, const int family, const unsigned short port, int flags, const struct cred *cred); -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net); +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister); void svc_xprt_received(struct svc_xprt *xprt); void svc_xprt_enqueue(struct svc_xprt *xprt); void svc_xprt_put(struct svc_xprt *xprt); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index fc70e13b1cb9..cb4010e2dc0c 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -436,7 +436,6 @@ void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net) svc_unregister(serv, net); rpcb_put_local(net); } -EXPORT_SYMBOL_GPL(svc_rpcb_cleanup); static int svc_uses_rpcbind(struct svc_serv *serv) { diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index b800d704d807..6973184ff667 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1115,6 +1115,7 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * svc_xprt_destroy_all - Destroy transports associated with @serv * @serv: RPC service to be shut down * @net: target network namespace + * @unregister: true if it is OK to unregister the destroyed xprts * * Server threads may still be running (especially in the case where the * service is still running in other network namespaces). @@ -1127,7 +1128,8 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * threads, we may need to wait a little while and then check again to * see if they're done. 
*/ -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister) { int delay = 0; @@ -1137,6 +1139,9 @@ void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) svc_clean_up_xprts(serv, net); msleep(delay++); } + + if (unregister) + svc_rpcb_cleanup(serv, net); } EXPORT_SYMBOL_GPL(svc_xprt_destroy_all); -- cgit v1.2.3 From 099f942182e3695554cba44e4bafb08a4111b50f Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Mon, 15 Sep 2025 20:37:20 +0200 Subject: spi: keep track of number of chipselects in spi_device There are several places where we need to iterate over a device's chipselect. To be able to do it efficiently, store the number of chipselects in spi_device, like we do for controllers. Since we now use a device supplied value, add a check to make sure it isn't more than we can support. Signed-off-by: Jonas Gorski Link: https://patch.msgid.link/20250915183725.219473-3-jonas.gorski@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 29 +++++++++++++++++++++-------- include/linux/spi/spi.h | 4 +++- 2 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index b07d6cdf587c..6598fb862d80 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -586,6 +586,7 @@ struct spi_device *spi_alloc_device(struct spi_controller *ctlr) spi->dev.bus = &spi_bus_type; spi->dev.release = spidev_release; spi->mode = ctlr->buswidth_override_bits; + spi->num_chipselect = 1; device_initialize(&spi->dev); return spi; @@ -635,7 +636,7 @@ static inline int spi_dev_check_cs(struct device *dev, u8 idx_new; cs = spi_get_chipselect(spi, idx); - for (idx_new = new_idx; idx_new < SPI_CS_CNT_MAX; idx_new++) { + for (idx_new = new_idx; idx_new < new_spi->num_chipselect; idx_new++) { cs_new = spi_get_chipselect(new_spi, idx_new); if (is_valid_cs(cs) && is_valid_cs(cs_new) && cs == cs_new) { dev_err(dev, "chipselect %u already in use\n", cs_new); @@ -652,7 +653,7 @@ static int spi_dev_check(struct device *dev, void *data) int status, idx; if (spi->controller == new_spi->controller) { - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < spi->num_chipselect; idx++) { status = spi_dev_check_cs(dev, spi, idx, new_spi, 0); if (status) return status; @@ -674,7 +675,13 @@ static int __spi_add_device(struct spi_device *spi) int status, idx; u8 cs; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + if (spi->num_chipselect > SPI_CS_CNT_MAX) { + dev_err(dev, "num_cs %d > max %d\n", spi->num_chipselect, + SPI_CS_CNT_MAX); + return -EOVERFLOW; + } + + for (idx = 0; idx < spi->num_chipselect; idx++) { /* Chipselects are numbered 0..max; validate. */ cs = spi_get_chipselect(spi, idx); if (is_valid_cs(cs) && cs >= ctlr->num_chipselect) { @@ -689,7 +696,7 @@ static int __spi_add_device(struct spi_device *spi) * For example, spi->chip_select[0] != spi->chip_select[1] and so on. 
*/ if (!spi_controller_is_target(ctlr)) { - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < spi->num_chipselect; idx++) { status = spi_dev_check_cs(dev, spi, idx, spi, idx + 1); if (status) return status; @@ -717,7 +724,7 @@ static int __spi_add_device(struct spi_device *spi) if (ctlr->cs_gpiods) { u8 cs; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < spi->num_chipselect; idx++) { cs = spi_get_chipselect(spi, idx); if (is_valid_cs(cs)) spi_set_csgpiod(spi, idx, ctlr->cs_gpiods[cs]); @@ -1024,7 +1031,7 @@ static void spi_res_release(struct spi_controller *ctlr, struct spi_message *mes /*-------------------------------------------------------------------------*/ #define spi_for_each_valid_cs(spi, idx) \ - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) \ + for (idx = 0; idx < spi->num_chipselect; idx++) \ if (!(spi->cs_index_mask & BIT(idx))) {} else static inline bool spi_is_last_cs(struct spi_device *spi) @@ -1080,8 +1087,12 @@ static void spi_set_cs(struct spi_device *spi, bool enable, bool force) trace_spi_set_cs(spi, activate); spi->controller->last_cs_index_mask = spi->cs_index_mask; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) - spi->controller->last_cs[idx] = enable ? spi_get_chipselect(spi, 0) : SPI_INVALID_CS; + for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + if (enable && idx < spi->num_chipselect) + spi->controller->last_cs[idx] = spi_get_chipselect(spi, 0); + else + spi->controller->last_cs[idx] = SPI_INVALID_CS; + } spi->controller->last_cs_mode_high = spi->mode & SPI_CS_HIGH; if (spi->controller->last_cs_mode_high) @@ -2452,6 +2463,8 @@ static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, dev_err(&ctlr->dev, "SPI controller doesn't support multi CS\n"); return -EINVAL; } + + spi->num_chipselect = rc; for (idx = 0; idx < rc; idx++) spi_set_chipselect(spi, idx, cs[idx]); diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e9ea43234d9a..49c048277e97 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -170,6 +170,7 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * two delays will be added up. * @chip_select: Array of physical chipselect, spi->chipselect[i] gives * the corresponding physical CS for logical CS i. + * @num_chipselect: Number of physical chipselects used. * @cs_index_mask: Bit mask of the active chipselect(s) in the chipselect array * @cs_gpiod: Array of GPIO descriptors of the corresponding chipselect lines * (optional, NULL when not using a GPIO line) @@ -229,6 +230,7 @@ struct spi_device { struct spi_delay cs_inactive; u8 chip_select[SPI_CS_CNT_MAX]; + u8 num_chipselect; /* * Bit mask of the chipselect(s) that the driver need to use from @@ -315,7 +317,7 @@ static inline bool spi_is_csgpiod(struct spi_device *spi) { u8 idx; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < spi->num_chipselect; idx++) { if (spi_get_csgpiod(spi, idx)) return true; } -- cgit v1.2.3 From 08fda410bae41cc8dde9697f9104da525be53153 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Mon, 15 Sep 2025 20:37:24 +0200 Subject: spi: reduce device chip select limit again The spi chipselect limit SPI_CS_CNT_MAX was raised with commit 2f8c7c3715f2 ("spi: Raise limit on number of chip selects") from 4 to 16 to accommodate spi controllers with more than 4 chip selects, and then later to 24 with commit 96893cdd4760 ("spi: Raise limit on number of chip selects to 24"). 
Now that we removed SPI_CS_CNT_MAX limiting the chip selects of controllers, we can reduce the number of chip selects per device again to 4, the original value. Signed-off-by: Jonas Gorski Link: https://patch.msgid.link/20250915183725.219473-7-jonas.gorski@gmail.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 49c048277e97..df4842abbc6f 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -21,7 +21,7 @@ #include /* Max no. of CS supported per spi device */ -#define SPI_CS_CNT_MAX 24 +#define SPI_CS_CNT_MAX 4 struct dma_chan; struct software_node; -- cgit v1.2.3 From e336ab509b43ea601801dfa05b4270023c3ed007 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Mon, 15 Sep 2025 20:37:25 +0200 Subject: spi: rename SPI_CS_CNT_MAX => SPI_DEVICE_CS_CNT_MAX Rename SPI_CS_CNT_MAX to SPI_DEVICE_CS_CNT_MAX to make it more obvious that this is the max number of CS per device supported, not per controller. Signed-off-by: Jonas Gorski Link: https://patch.msgid.link/20250915183725.219473-8-jonas.gorski@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-quadspi.c | 2 +- drivers/spi/spi.c | 14 +++++++------- include/linux/spi/spi.h | 12 ++++++------ 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index af253b86f1ab..2ce4fc136a51 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -33,7 +33,7 @@ #define CQSPI_NAME "cadence-qspi" #define CQSPI_MAX_CHIPSELECT 4 -static_assert(CQSPI_MAX_CHIPSELECT <= SPI_CS_CNT_MAX); +static_assert(CQSPI_MAX_CHIPSELECT <= SPI_DEVICE_CS_CNT_MAX); /* Quirks */ #define CQSPI_NEEDS_WR_DELAY BIT(0) diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 2eb361e9e44d..2e0647a06890 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -670,9 +670,9 @@ static int __spi_add_device(struct spi_device *spi) int status, idx; u8 cs; - if (spi->num_chipselect > SPI_CS_CNT_MAX) { + if (spi->num_chipselect > SPI_DEVICE_CS_CNT_MAX) { dev_err(dev, "num_cs %d > max %d\n", spi->num_chipselect, - SPI_CS_CNT_MAX); + SPI_DEVICE_CS_CNT_MAX); return -EOVERFLOW; } @@ -699,7 +699,7 @@ static int __spi_add_device(struct spi_device *spi) } /* Initialize unused logical CS as invalid */ - for (idx = spi->num_chipselect; idx < SPI_CS_CNT_MAX; idx++) + for (idx = spi->num_chipselect; idx < SPI_DEVICE_CS_CNT_MAX; idx++) spi_set_chipselect(spi, idx, SPI_INVALID_CS); /* Set the bus ID string */ @@ -1076,7 +1076,7 @@ static void spi_set_cs(struct spi_device *spi, bool enable, bool force) trace_spi_set_cs(spi, activate); spi->controller->last_cs_index_mask = spi->cs_index_mask; - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { + for (idx = 0; idx < SPI_DEVICE_CS_CNT_MAX; idx++) { if (enable && idx < spi->num_chipselect) spi->controller->last_cs[idx] = spi_get_chipselect(spi, 0); else @@ -2354,7 +2354,7 @@ static void of_spi_parse_dt_cs_delay(struct device_node *nc, static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, struct device_node *nc) { - u32 value, cs[SPI_CS_CNT_MAX]; + u32 value, cs[SPI_DEVICE_CS_CNT_MAX]; int rc, idx; /* Mode (clock phase/polarity/etc.)
*/ @@ -2429,7 +2429,7 @@ static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, /* Device address */ rc = of_property_read_variable_u32_array(nc, "reg", &cs[0], 1, - SPI_CS_CNT_MAX); + SPI_DEVICE_CS_CNT_MAX); if (rc < 0) { dev_err(&ctlr->dev, "%pOF has no valid 'reg' property (%d)\n", nc, rc); @@ -3313,7 +3313,7 @@ int spi_register_controller(struct spi_controller *ctlr) } /* Setting last_cs to SPI_INVALID_CS means no chip selected */ - for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) + for (idx = 0; idx < SPI_DEVICE_CS_CNT_MAX; idx++) ctlr->last_cs[idx] = SPI_INVALID_CS; status = device_add(&ctlr->dev); diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index df4842abbc6f..cb2c2df31089 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -21,7 +21,7 @@ #include /* Max no. of CS supported per spi device */ -#define SPI_CS_CNT_MAX 4 +#define SPI_DEVICE_CS_CNT_MAX 4 struct dma_chan; struct software_node; @@ -229,7 +229,7 @@ struct spi_device { struct spi_delay cs_hold; struct spi_delay cs_inactive; - u8 chip_select[SPI_CS_CNT_MAX]; + u8 chip_select[SPI_DEVICE_CS_CNT_MAX]; u8 num_chipselect; /* @@ -238,9 +238,9 @@ struct spi_device { * multiple chip selects & memories are connected in parallel * then more than one bit need to be set in cs_index_mask. */ - u32 cs_index_mask : SPI_CS_CNT_MAX; + u32 cs_index_mask : SPI_DEVICE_CS_CNT_MAX; - struct gpio_desc *cs_gpiod[SPI_CS_CNT_MAX]; /* Chip select gpio desc */ + struct gpio_desc *cs_gpiod[SPI_DEVICE_CS_CNT_MAX]; /* Chip select gpio desc */ /* * Likely need more hooks for more protocol options affecting how @@ -721,8 +721,8 @@ struct spi_controller { bool auto_runtime_pm; bool fallback; bool last_cs_mode_high; - s8 last_cs[SPI_CS_CNT_MAX]; - u32 last_cs_index_mask : SPI_CS_CNT_MAX; + s8 last_cs[SPI_DEVICE_CS_CNT_MAX]; + u32 last_cs_index_mask : SPI_DEVICE_CS_CNT_MAX; struct completion xfer_completion; size_t max_dma_len; -- cgit v1.2.3 From 8535bd38b4d58a3d19bf8e7dfa66e1d8180b316a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 22 Sep 2025 14:42:35 +0200 Subject: cgroup: add missing ns_common include Add the missing include of the ns_common header. Acked-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/cgroup_namespace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup_namespace.h b/include/linux/cgroup_namespace.h index b7dbf4d623d2..81ccbdee425b 100644 --- a/include/linux/cgroup_namespace.h +++ b/include/linux/cgroup_namespace.h @@ -2,6 +2,8 @@ #ifndef _LINUX_CGROUP_NAMESPACE_H #define _LINUX_CGROUP_NAMESPACE_H +#include + struct cgroup_namespace { struct ns_common ns; struct user_namespace *user_ns; -- cgit v1.2.3 From d7610cb7454bbd8bf6d58f71b0ed57155d3c545f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 22 Sep 2025 14:42:36 +0200 Subject: ns: simplify ns_common_init() further Simply derive the ns operations from the namespace type. 
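As an illustration of the resulting API, here is a minimal sketch of a namespace constructor (the foo_namespace type, foons_operations and the error path are hypothetical, and assume the type has been wired into the to_ns_operations() _Generic table shown below):

static struct foo_namespace *create_foo_ns(void)
{
	struct foo_namespace *ns;
	int err;

	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		return ERR_PTR(-ENOMEM);

	/* Previously: ns_common_init(ns, &foons_operations). The ops
	 * pointer is now derived from the pointer type of @ns.
	 */
	err = ns_common_init(ns);
	if (err) {
		kfree(ns);
		return ERR_PTR(err);
	}
	return ns;
}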
Acked-by: Thomas Gleixner Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 4 ++-- include/linux/ns_common.h | 30 ++++++++++++++++++++++++++---- ipc/namespace.c | 2 +- kernel/cgroup/namespace.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/time/namespace.c | 2 +- kernel/user_namespace.c | 2 +- kernel/utsname.c | 2 +- net/core/net_namespace.c | 9 +-------- 9 files changed, 35 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index 271cd6294c8a..d65917ec5544 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4104,9 +4104,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } if (anon) - ret = ns_common_init_inum(new_ns, &mntns_operations, MNT_NS_ANON_INO); + ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO); else - ret = ns_common_init(new_ns, &mntns_operations); + ret = ns_common_init(new_ns); if (ret) { kfree(new_ns); dec_mnt_namespaces(ucounts); diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index aea8528d799a..56492cd9ff8d 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -25,6 +25,17 @@ extern struct time_namespace init_time_ns; extern struct user_namespace init_user_ns; extern struct uts_namespace init_uts_ns; +extern const struct proc_ns_operations netns_operations; +extern const struct proc_ns_operations utsns_operations; +extern const struct proc_ns_operations ipcns_operations; +extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations pidns_for_children_operations; +extern const struct proc_ns_operations userns_operations; +extern const struct proc_ns_operations mntns_operations; +extern const struct proc_ns_operations cgroupns_operations; +extern const struct proc_ns_operations timens_operations; +extern const struct proc_ns_operations timens_for_children_operations; + struct ns_common { struct dentry *stashed; const struct proc_ns_operations *ops; @@ -84,10 +95,21 @@ void __ns_common_free(struct ns_common *ns); struct user_namespace *: &init_user_ns, \ struct uts_namespace *: &init_uts_ns) -#define ns_common_init(__ns, __ops) \ - __ns_common_init(to_ns_common(__ns), __ops, (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) - -#define ns_common_init_inum(__ns, __ops, __inum) __ns_common_init(to_ns_common(__ns), __ops, __inum) +#define to_ns_operations(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ + struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ + struct mnt_namespace *: &mntns_operations, \ + struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \ + struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ + struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ + struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ + struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) + +#define ns_common_init(__ns) \ + __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), (((__ns) == ns_init_ns(__ns)) ? 
ns_init_inum(__ns) : 0)) + +#define ns_common_init_inum(__ns, __inum) __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), __inum) #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) diff --git a/ipc/namespace.c b/ipc/namespace.c index bd85d1c9d2c2..d89dfd718d2b 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -62,7 +62,7 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (ns == NULL) goto fail_dec; - err = ns_common_init(ns, &ipcns_operations); + err = ns_common_init(ns); if (err) goto fail_free; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 16ead7508371..04c98338ac08 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -27,7 +27,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = ns_common_init(new_ns, &cgroupns_operations); + ret = ns_common_init(new_ns); if (ret) return ERR_PTR(ret); ns_tree_add(new_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 162f5fb63d75..a262a3f19443 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -103,7 +103,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_idr; - err = ns_common_init(ns, &pidns_operations); + err = ns_common_init(ns); if (err) goto out_free_idr; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 7aa4d6fedd49..9f26e61be044 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -97,7 +97,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, if (!ns->vvar_page) goto fail_free; - err = ns_common_init(ns, &timens_operations); + err = ns_common_init(ns); if (err) goto fail_free_page; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f9df45c46235..e1559e8a8a02 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -126,7 +126,7 @@ int create_user_ns(struct cred *new) ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); - ret = ns_common_init(ns, &userns_operations); + ret = ns_common_init(ns); if (ret) goto fail_free; diff --git a/kernel/utsname.c b/kernel/utsname.c index 95d733eb2c98..00001592ad13 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -50,7 +50,7 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, if (!ns) goto fail_dec; - err = ns_common_init(ns, &utsns_operations); + err = ns_common_init(ns); if (err) goto fail_free; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index d5e3fd819163..bdea7d5fac56 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -400,16 +400,9 @@ static __net_init void preinit_net_sysctl(struct net *net) /* init code that must occur even if setup_net() is not called. */ static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns) { - const struct proc_ns_operations *ns_ops; int ret; -#ifdef CONFIG_NET_NS - ns_ops = &netns_operations; -#else - ns_ops = NULL; -#endif - - ret = ns_common_init(net, ns_ops); + ret = ns_common_init(net); if (ret) return ret; -- cgit v1.2.3 From 0950c64ae38661bd97127e9aa0522f1624f82006 Mon Sep 17 00:00:00 2001 From: Kriish Sharma Date: Mon, 22 Sep 2025 12:26:06 +0000 Subject: workqueue: fix texinfodocs warning for WQ_* flags reference Sphinx emitted a warning during make texinfodocs: WARNING: Inline literal start-string without end-string. 
This was caused by the trailing '*' in "%WQ_*" being parsed as reStructuredText markup in the kernel-doc comment. Escape the '*' in the comment so that Sphinx treats it as a literal character, resolving the warning. Signed-off-by: Kriish Sharma Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 71a9900c03c7..dabc351cc127 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -502,7 +502,7 @@ void workqueue_softirq_dead(unsigned int cpu); * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means * that the sum of per-node max_active's may be larger than @max_active. * - * For detailed information on %WQ_* flags, please refer to + * For detailed information on %WQ_\* flags, please refer to * Documentation/core-api/workqueue.rst. * * RETURNS: -- cgit v1.2.3 From a571f08d3db215dd6ec294d8faac8cc4184bc4e4 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 16 Sep 2025 22:46:36 +0100 Subject: net: phy: add phy_interface_copy() Add a helper for copying PHY interface bitmasks. This will be used by the SFP bus code, which will then be moved to phylink in the subsequent patches. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1uydVU-000000061W8-2IDT@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 7da9e19471c9..d09fc42e61f3 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -169,6 +169,11 @@ static inline bool phy_interface_empty(const unsigned long *intf) return bitmap_empty(intf, PHY_INTERFACE_MODE_MAX); } +static inline void phy_interface_copy(unsigned long *d, const unsigned long *s) +{ + bitmap_copy(d, s, PHY_INTERFACE_MODE_MAX); +} + static inline unsigned int phy_interface_weight(const unsigned long *intf) { return bitmap_weight(intf, PHY_INTERFACE_MODE_MAX); -- cgit v1.2.3 From ddae6127afbba46e32af3b31eb7bba939e1fad96 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 16 Sep 2025 22:46:41 +0100 Subject: net: sfp: pre-parse the module support Pre-parse the module support on insert rather than when the upstream requests the data. This will allow more flexible and extensible parsing. 
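To make the intent concrete, a sketch of a hypothetical upstream's module_insert handler (names invented): its calls keep working unchanged at this point in the series, but the parsing they rely on now happens once, at insert time, in the sfp_init_module() added below:

static int foo_module_insert(void *upstream, const struct sfp_eeprom_id *id)
{
	struct foo_priv *priv = upstream;
	__ETHTOOL_DECLARE_LINK_MODE_MASK(support) = { 0, };
	DECLARE_PHY_INTERFACE_MASK(interfaces) = { 0, };

	/* These helpers now return data cached from the EEPROM at
	 * module insert time rather than re-parsing it on each call.
	 */
	sfp_parse_support(priv->sfp_bus, id, support, interfaces);
	priv->port = sfp_parse_port(priv->sfp_bus, id, support);
	priv->has_phy = sfp_may_have_phy(priv->sfp_bus, id);
	return 0;
}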
Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1uydVZ-000000061WE-2pXD@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/sfp-bus.c | 80 ++++++++++++++++++++++++++++++++--------------- include/linux/sfp.h | 22 +++++++++++++ 2 files changed, 77 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index f13c00b5b449..35030c527fbe 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -22,7 +22,6 @@ struct sfp_bus { const struct sfp_socket_ops *socket_ops; struct device *sfp_dev; struct sfp *sfp; - const struct sfp_quirk *sfp_quirk; const struct sfp_upstream_ops *upstream_ops; void *upstream; @@ -30,6 +29,8 @@ struct sfp_bus { bool registered; bool started; + + struct sfp_module_caps caps; }; /** @@ -48,6 +49,13 @@ struct sfp_bus { */ int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned long *support) +{ + return bus->caps.port; +} +EXPORT_SYMBOL_GPL(sfp_parse_port); + +static void sfp_module_parse_port(struct sfp_bus *bus, + const struct sfp_eeprom_id *id) { int port; @@ -91,21 +99,18 @@ int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, break; } - if (support) { - switch (port) { - case PORT_FIBRE: - phylink_set(support, FIBRE); - break; + switch (port) { + case PORT_FIBRE: + phylink_set(bus->caps.link_modes, FIBRE); + break; - case PORT_TP: - phylink_set(support, TP); - break; - } + case PORT_TP: + phylink_set(bus->caps.link_modes, TP); + break; } - return port; + bus->caps.port = port; } -EXPORT_SYMBOL_GPL(sfp_parse_port); /** * sfp_may_have_phy() - indicate whether the module may have a PHY @@ -117,8 +122,17 @@ EXPORT_SYMBOL_GPL(sfp_parse_port); */ bool sfp_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id) { - if (id->base.e1000_base_t) - return true; + return bus->caps.may_have_phy; +} +EXPORT_SYMBOL_GPL(sfp_may_have_phy); + +static void sfp_module_parse_may_have_phy(struct sfp_bus *bus, + const struct sfp_eeprom_id *id) +{ + if (id->base.e1000_base_t) { + bus->caps.may_have_phy = true; + return; + } if (id->base.phys_id != SFF8024_ID_DWDM_SFP) { switch (id->base.extended_cc) { @@ -126,13 +140,13 @@ bool sfp_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id) case SFF8024_ECC_10GBASE_T_SR: case SFF8024_ECC_5GBASE_T: case SFF8024_ECC_2_5GBASE_T: - return true; + bus->caps.may_have_phy = true; + return; } } - return false; + bus->caps.may_have_phy = false; } -EXPORT_SYMBOL_GPL(sfp_may_have_phy); /** * sfp_parse_support() - Parse the eeprom id for supported link modes @@ -148,8 +162,17 @@ EXPORT_SYMBOL_GPL(sfp_may_have_phy); void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned long *support, unsigned long *interfaces) { + linkmode_or(support, support, bus->caps.link_modes); + phy_interface_copy(interfaces, bus->caps.interfaces); +} +EXPORT_SYMBOL_GPL(sfp_parse_support); + +static void sfp_module_parse_support(struct sfp_bus *bus, + const struct sfp_eeprom_id *id) +{ + unsigned long *interfaces = bus->caps.interfaces; + unsigned long *modes = bus->caps.link_modes; unsigned int br_min, br_nom, br_max; - __ETHTOOL_DECLARE_LINK_MODE_MASK(modes) = { 0, }; /* Decode the bitrate information to MBd */ br_min = br_nom = br_max = 0; @@ -338,13 +361,22 @@ void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, phylink_set(modes, Autoneg); phylink_set(modes, Pause); phylink_set(modes, Asym_Pause); +} + +static void sfp_init_module(struct 
sfp_bus *bus, + const struct sfp_eeprom_id *id, + const struct sfp_quirk *quirk) +{ + memset(&bus->caps, 0, sizeof(bus->caps)); - if (bus->sfp_quirk && bus->sfp_quirk->modes) - bus->sfp_quirk->modes(id, modes, interfaces); + sfp_module_parse_support(bus, id); + sfp_module_parse_port(bus, id); + sfp_module_parse_may_have_phy(bus, id); - linkmode_or(support, support, modes); + if (quirk && quirk->modes) + quirk->modes(id, bus->caps.link_modes, + bus->caps.interfaces); } -EXPORT_SYMBOL_GPL(sfp_parse_support); /** * sfp_select_interface() - Select appropriate phy_interface_t mode @@ -794,7 +826,7 @@ int sfp_module_insert(struct sfp_bus *bus, const struct sfp_eeprom_id *id, const struct sfp_upstream_ops *ops = sfp_get_upstream_ops(bus); int ret = 0; - bus->sfp_quirk = quirk; + sfp_init_module(bus, id, quirk); if (ops && ops->module_insert) ret = ops->module_insert(bus->upstream, id); @@ -809,8 +841,6 @@ void sfp_module_remove(struct sfp_bus *bus) if (ops && ops->module_remove) ops->module_remove(bus->upstream); - - bus->sfp_quirk = NULL; } EXPORT_SYMBOL_GPL(sfp_module_remove); diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 60c65cea74f6..5fb59cf49882 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -521,6 +521,28 @@ struct ethtool_eeprom; struct ethtool_modinfo; struct sfp_bus; +/** + * struct sfp_module_caps - sfp module capabilities + * @interfaces: bitmap of interfaces that the module may support + * @link_modes: bitmap of ethtool link modes that the module may support + */ +struct sfp_module_caps { + DECLARE_PHY_INTERFACE_MASK(interfaces); + __ETHTOOL_DECLARE_LINK_MODE_MASK(link_modes); + /** + * @may_have_phy: indicate whether the module may have an ethernet PHY + * There is no way to be sure that a module has a PHY as the EEPROM + * doesn't contain this information. When set, this does not mean that + * the module definitely has a PHY. + */ + bool may_have_phy; + /** + * @port: one of ethtool %PORT_* definitions, parsed from the module + * EEPROM, or %PORT_OTHER if the port type is not known. + */ + u8 port; +}; + /** * struct sfp_upstream_ops - upstream operations structure * @attach: called when the sfp socket driver is bound to the upstream -- cgit v1.2.3 From 64fb4a3ae8a5d9c4d27d9f4ae58e38200fc3d44b Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 16 Sep 2025 22:46:51 +0100 Subject: net: sfp: provide sfp_get_module_caps() Provide a function to retrieve the current sfp_module_caps structure so that upstreams can get the entire module support in one go. 
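Taking the hypothetical upstream sketched earlier, the same handler can now collapse to a single query (again a sketch, not a definitive implementation; the NULL check covers the !CONFIG_SFP stub added below):

static int foo_module_insert(void *upstream, const struct sfp_eeprom_id *id)
{
	struct foo_priv *priv = upstream;
	const struct sfp_module_caps *caps;

	caps = sfp_get_module_caps(priv->sfp_bus);
	if (!caps)
		return -EOPNOTSUPP;	/* CONFIG_SFP disabled */

	linkmode_copy(priv->supported, caps->link_modes);
	phy_interface_copy(priv->interfaces, caps->interfaces);
	priv->may_have_phy = caps->may_have_phy;
	priv->port = caps->port;
	return 0;
}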
Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1uydVj-000000061WQ-3q47@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/sfp-bus.c | 6 ++++++ include/linux/sfp.h | 7 +++++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index b77190494b04..e8cf411396fa 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -33,6 +33,12 @@ struct sfp_bus { struct sfp_module_caps caps; }; +const struct sfp_module_caps *sfp_get_module_caps(struct sfp_bus *bus) +{ + return &bus->caps; +} +EXPORT_SYMBOL_GPL(sfp_get_module_caps); + /** * sfp_parse_port() - Parse the EEPROM base ID, setting the port type * @bus: a pointer to the &struct sfp_bus structure for the sfp module diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 5fb59cf49882..9f29fcad52be 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -576,6 +576,7 @@ struct sfp_upstream_ops { }; #if IS_ENABLED(CONFIG_SFP) +const struct sfp_module_caps *sfp_get_module_caps(struct sfp_bus *bus); int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned long *support); bool sfp_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id); @@ -600,6 +601,12 @@ int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, void sfp_bus_del_upstream(struct sfp_bus *bus); const char *sfp_get_name(struct sfp_bus *bus); #else +static inline const struct sfp_module_caps * +sfp_get_module_caps(struct sfp_bus *bus) +{ + return NULL; +} + static inline int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned long *support) -- cgit v1.2.3 From 9ce138735efcb395974952972aa5dbd1d444ac2c Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 16 Sep 2025 22:47:07 +0100 Subject: net: sfp: remove old sfp_parse_* functions Remove the old sfp_parse_*() functions that are now no longer used. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1uydVz-000000061Wj-13Yd@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/sfp-bus.c | 54 ----------------------------------------------- include/linux/sfp.h | 25 ---------------------- 2 files changed, 79 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c index e8cf411396fa..b945d75966d5 100644 --- a/drivers/net/phy/sfp-bus.c +++ b/drivers/net/phy/sfp-bus.c @@ -39,27 +39,6 @@ const struct sfp_module_caps *sfp_get_module_caps(struct sfp_bus *bus) } EXPORT_SYMBOL_GPL(sfp_get_module_caps); -/** - * sfp_parse_port() - Parse the EEPROM base ID, setting the port type - * @bus: a pointer to the &struct sfp_bus structure for the sfp module - * @id: a pointer to the module's &struct sfp_eeprom_id - * @support: optional pointer to an array of unsigned long for the - * ethtool support mask - * - * Parse the EEPROM identification given in @id, and return one of - * %PORT_TP, %PORT_FIBRE or %PORT_OTHER. If @support is non-%NULL, - * also set the ethtool %ETHTOOL_LINK_MODE_xxx_BIT corresponding with - * the connector type. - * - * If the port type is not known, returns %PORT_OTHER. 
- */ -int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, - unsigned long *support) -{ - return bus->caps.port; -} -EXPORT_SYMBOL_GPL(sfp_parse_port); - static void sfp_module_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id) { @@ -118,20 +97,6 @@ static void sfp_module_parse_port(struct sfp_bus *bus, bus->caps.port = port; } -/** - * sfp_may_have_phy() - indicate whether the module may have a PHY - * @bus: a pointer to the &struct sfp_bus structure for the sfp module - * @id: a pointer to the module's &struct sfp_eeprom_id - * - * Parse the EEPROM identification given in @id, and return whether - * this module may have a PHY. - */ -bool sfp_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id) -{ - return bus->caps.may_have_phy; -} -EXPORT_SYMBOL_GPL(sfp_may_have_phy); - static void sfp_module_parse_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id) { @@ -154,25 +119,6 @@ static void sfp_module_parse_may_have_phy(struct sfp_bus *bus, bus->caps.may_have_phy = false; } -/** - * sfp_parse_support() - Parse the eeprom id for supported link modes - * @bus: a pointer to the &struct sfp_bus structure for the sfp module - * @id: a pointer to the module's &struct sfp_eeprom_id - * @support: pointer to an array of unsigned long for the ethtool support mask - * @interfaces: pointer to an array of unsigned long for phy interface modes - * mask - * - * Parse the EEPROM identification information and derive the supported - * ethtool link modes for the module. - */ -void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, - unsigned long *support, unsigned long *interfaces) -{ - linkmode_or(support, support, bus->caps.link_modes); - phy_interface_copy(interfaces, bus->caps.interfaces); -} -EXPORT_SYMBOL_GPL(sfp_parse_support); - static void sfp_module_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id) { diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 9f29fcad52be..5c71945a5e4d 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -577,11 +577,6 @@ struct sfp_upstream_ops { #if IS_ENABLED(CONFIG_SFP) const struct sfp_module_caps *sfp_get_module_caps(struct sfp_bus *bus); -int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, - unsigned long *support); -bool sfp_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id); -void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, - unsigned long *support, unsigned long *interfaces); phy_interface_t sfp_select_interface(struct sfp_bus *bus, const unsigned long *link_modes); @@ -607,26 +602,6 @@ sfp_get_module_caps(struct sfp_bus *bus) return NULL; } -static inline int sfp_parse_port(struct sfp_bus *bus, - const struct sfp_eeprom_id *id, - unsigned long *support) -{ - return PORT_OTHER; -} - -static inline bool sfp_may_have_phy(struct sfp_bus *bus, - const struct sfp_eeprom_id *id) -{ - return false; -} - -static inline void sfp_parse_support(struct sfp_bus *bus, - const struct sfp_eeprom_id *id, - unsigned long *support, - unsigned long *interfaces) -{ -} - static inline phy_interface_t sfp_select_interface(struct sfp_bus *bus, const unsigned long *link_modes) { -- cgit v1.2.3 From 1b44d700023e77dd92821e7811db825e75a1a394 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:52 +0000 Subject: tcp: move tcp->rcv_tstamp to tcp_sock_write_txrx group tcp_ack() writes this field, so it belongs to tcp_sock_write_txrx.
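For readers following the cacheline work: the groups are delimited inside struct tcp_sock by __cacheline_group_begin()/__cacheline_group_end() markers, and tcp_struct_check() pins membership with compile-time asserts, as the diff below shows. The pattern on a toy struct (names invented):

struct toy_sock {
	__cacheline_group_begin(toy_write_txrx);
	u32	rcv_tstamp;	/* written by both TX and RX paths */
	u32	rcv_wnd;
	__cacheline_group_end(toy_write_txrx);
};

static void __init toy_struct_check(void)
{
	/* BUILD_BUG_ON()-based: breaks the build if a field drifts
	 * out of its declared group.
	 */
	CACHELINE_ASSERT_GROUP_MEMBER(struct toy_sock, toy_write_txrx, rcv_tstamp);
	CACHELINE_ASSERT_GROUP_MEMBER(struct toy_sock, toy_write_txrx, rcv_wnd);
}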
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/tcp_sock.rst | 2 +- include/linux/tcp.h | 4 ++-- net/ipv4/tcp.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index d4dc01800945..429df29fba8b 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -26,7 +26,7 @@ u64 bytes_acked read_w u32 dsack_dups u32 snd_una read_mostly read_write tcp_wnd_end,tcp_urg_mode,tcp_minshall_check,tcp_cwnd_validate(tx);tcp_ack,tcp_may_update_window,tcp_clean_rtx_queue(write),tcp_ack_tstamp(rx) u32 snd_sml read_write tcp_minshall_check,tcp_minshall_update -u32 rcv_tstamp read_mostly tcp_ack +u32 rcv_tstamp read_write read_write tcp_ack void * tcp_clean_acked read_mostly tcp_ack u32 lsndtime read_write tcp_slow_start_after_idle_check,tcp_event_data_sent u32 last_oow_ack_time diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3ca5ed02de6d..1e6c2ded22c9 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -238,7 +238,6 @@ struct tcp_sock { /* RX read-mostly hotpath cache lines */ __cacheline_group_begin(tcp_sock_read_rx); u32 copied_seq; /* Head of yet unread data */ - u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 snd_wl1; /* Sequence for window update */ u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 rttvar_us; /* smoothed mdev_max */ @@ -246,13 +245,13 @@ struct tcp_sock { u16 advmss; /* Advertised MSS */ u16 urg_data; /* Saved octet of OOB data and control flags */ u32 lost; /* Total data packets lost incl. rexmits */ + u32 snd_ssthresh; /* Slow start size threshold */ struct minmax rtt_min; /* OOO segments go in this rbtree. Socket lock must be held. */ struct rb_root out_of_order_queue; #if defined(CONFIG_TLS_DEVICE) void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); #endif - u32 snd_ssthresh; /* Slow start size threshold */ u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ __cacheline_group_end(tcp_sock_read_rx); @@ -319,6 +318,7 @@ struct tcp_sock { */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ + u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ /* * Options received (usually on last packet, some only on SYN packets). 
*/ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5932dba3bd71..721287ca3328 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5114,7 +5114,6 @@ static void __init tcp_struct_check(void) /* RX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq); - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rcv_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us); @@ -5164,6 +5163,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); /* RX read-write hotpath cache lines */ -- cgit v1.2.3 From 969904dcd77dbb0a773d66cddaa59eccc6415d03 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:53 +0000 Subject: tcp: move recvmsg_inq to tcp_sock_read_txrx Fill a hole in tcp_sock_read_txrx, instead of possibly wasting a cache line. Note that tcp_recvmsg_locked() is also reading tp->repair, so this removes one cache line miss in tcp recvmsg(). Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/tcp_sock.rst | 2 +- include/linux/tcp.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 429df29fba8b..c2138619b995 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -57,7 +57,7 @@ u8:1 is_sack_reneg read_m u8:2 fastopen_client_fail u8:4 nonagle read_write tcp_skb_entail,tcp_push_pending_frames u8:1 thin_lto -u8:1 recvmsg_inq +u8:1 recvmsg_inq read_mostly tcp_recvmsg u8:1 repair read_mostly tcp_write_xmit u8:1 frto u8 repair_queue diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1e6c2ded22c9..c1d7fce251d7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -232,7 +232,8 @@ struct tcp_sock { repair : 1, tcp_usec_ts : 1, /* TSval values in usec */ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ - is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ + is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ + recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ __cacheline_group_end(tcp_sock_read_txrx); /* RX read-mostly hotpath cache lines */ @@ -252,7 +253,6 @@ struct tcp_sock { #if defined(CONFIG_TLS_DEVICE) void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); #endif - u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ __cacheline_group_end(tcp_sock_read_rx); /* TX read-write hotpath cache lines */ -- cgit v1.2.3 From a105ea47a4e855d24ebf65f1c5fb907162e7b8cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:54 +0000 Subject: tcp: move tcp_clean_acked to tcp_sock_read_tx group tp->tcp_clean_acked is fetched in tx path when snd_una is updated. 
This field thus belongs to the tcp_sock_read_tx group. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-7-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/tcp_sock.rst | 2 +- include/linux/tcp.h | 6 +++--- net/ipv4/tcp.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index c2138619b995..26f32dbcf6ec 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -27,7 +27,7 @@ u32 dsack_dups u32 snd_una read_mostly read_write tcp_wnd_end,tcp_urg_mode,tcp_minshall_check,tcp_cwnd_validate(tx);tcp_ack,tcp_may_update_window,tcp_clean_rtx_queue(write),tcp_ack_tstamp(rx) u32 snd_sml read_write tcp_minshall_check,tcp_minshall_update u32 rcv_tstamp read_write read_write tcp_ack -void * tcp_clean_acked read_mostly tcp_ack +void * tcp_clean_acked read_mostly tcp_ack u32 lsndtime read_write tcp_slow_start_after_idle_check,tcp_event_data_sent u32 last_oow_ack_time u32 compressed_ack_rcv_nxt diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c1d7fce251d7..3f282130c863 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -215,6 +215,9 @@ struct tcp_sock { u16 gso_segs; /* Max number of segs per GSO packet */ /* from STCP, retrans queue hinting */ struct sk_buff *retransmit_skb_hint; +#if defined(CONFIG_TLS_DEVICE) + void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); +#endif __cacheline_group_end(tcp_sock_read_tx); /* TXRX read-mostly hotpath cache lines */ @@ -250,9 +253,6 @@ struct tcp_sock { struct minmax rtt_min; /* OOO segments go in this rbtree. Socket lock must be held. */ struct rb_root out_of_order_queue; -#if defined(CONFIG_TLS_DEVICE) - void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); -#endif __cacheline_group_end(tcp_sock_read_rx); /* TX read-write hotpath cache lines */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 721287ca3328..7949d16506a4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5101,6 +5101,9 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint); +#if IS_ENABLED(CONFIG_TLS_DEVICE) + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, tcp_clean_acked); +#endif /* TXRX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset); @@ -5124,9 +5127,6 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh); -#if IS_ENABLED(CONFIG_TLS_DEVICE) - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tcp_clean_acked); -#endif /* TX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out); -- cgit v1.2.3 From 31c4511bbb0c75c525b6e4f4fe4167f2e9d3b05c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Sep 2025 20:48:55 +0000 Subject: tcp: move mtu_info to remove two 32bit holes This removes 8 bytes of waste on 64-bit builds.
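For reference, the kind of layout change involved, shown on a toy struct rather than the real tcp_sock (real layouts can be inspected with pahole, e.g. pahole -C tcp_sock vmlinux):

/* Before: two 4-byte holes on 64-bit, sizeof == 24 */
struct toy_before {
	u32	a;		/* bytes 0..3, then a 4-byte hole */
	void	*p;		/* bytes 8..15, 8-byte aligned */
	u32	mtu_info;	/* bytes 16..19, then 4 bytes tail padding */
};

/* After: the relocated u32 fills the hole, sizeof == 16 */
struct toy_after {
	u32	a;		/* bytes 0..3 */
	u32	mtu_info;	/* bytes 4..7 */
	void	*p;		/* bytes 8..15 */
};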
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250919204856.2977245-8-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3f282130c863..20b8c6e21fef 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -448,6 +448,9 @@ struct tcp_sock { * the first SYN. */ u32 undo_marker; /* snd_una upon a new recovery episode. */ int undo_retrans; /* number of undoable retransmissions. */ + u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG + * while socket was owned by user. + */ u64 bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans * Total data bytes retransmitted */ @@ -494,9 +497,6 @@ struct tcp_sock { u32 probe_seq_end; } mtu_probe; u32 plb_rehash; /* PLB-triggered rehash attempts */ - u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG - * while socket was owned by user. - */ #if IS_ENABLED(CONFIG_MPTCP) bool is_mptcp; #endif -- cgit v1.2.3 From 349271568303695f0ac3563af153d2b4542f6986 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 21 Sep 2025 18:01:16 +0200 Subject: bpf: Implement signature verification for BPF programs This patch extends the BPF_PROG_LOAD command by adding three new fields to `union bpf_attr` in the user-space API: - signature: A pointer to the signature blob. - signature_size: The size of the signature blob. - keyring_id: The serial number of a loaded kernel keyring (e.g., the user or session keyring) containing the trusted public keys. When a BPF program is loaded with a signature, the kernel: 1. Retrieves the trusted keyring using the provided `keyring_id`. 2. Verifies the supplied signature against the BPF program's instruction buffer. 3. If the signature is valid and was generated by a key in the trusted keyring, the program load proceeds. 4. If no signature is provided, the load proceeds as before, allowing for backward compatibility. LSMs can choose to restrict unsigned programs and implement a security policy. 5. If signature verification fails for any reason, the program is not loaded.
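A userspace sketch of the new load path (raw bpf(2) syscall; the function, the buffer names and the ptr_to_u64() helper are local to this example, and any program type works; BPF_PROG_TYPE_SYSCALL is used purely for illustration):

#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>

static __u64 ptr_to_u64(const void *p)
{
	return (__u64)(unsigned long)p;
}

/* insns/insn_cnt: the program; sig/sig_len: a PKCS#7 blob over insns. */
static int load_signed_prog(const struct bpf_insn *insns, __u32 insn_cnt,
			    const void *sig, __u32 sig_len, __s32 keyring_id)
{
	union bpf_attr attr = {};

	attr.prog_type = BPF_PROG_TYPE_SYSCALL;
	attr.insns = ptr_to_u64(insns);
	attr.insn_cnt = insn_cnt;
	attr.license = ptr_to_u64("GPL");
	attr.signature = ptr_to_u64(sig);
	attr.signature_size = sig_len;
	attr.keyring_id = keyring_id;	/* e.g. a user keyring serial */

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}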
Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250921160120.9711-2-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- crypto/asymmetric_keys/pkcs7_verify.c | 1 + include/linux/verification.h | 1 + include/uapi/linux/bpf.h | 10 ++++++++ kernel/bpf/helpers.c | 2 +- kernel/bpf/syscall.c | 45 ++++++++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 10 ++++++++ tools/lib/bpf/bpf.c | 2 +- 7 files changed, 68 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/crypto/asymmetric_keys/pkcs7_verify.c b/crypto/asymmetric_keys/pkcs7_verify.c index f0d4ff3c20a8..6d6475e3a9bf 100644 --- a/crypto/asymmetric_keys/pkcs7_verify.c +++ b/crypto/asymmetric_keys/pkcs7_verify.c @@ -429,6 +429,7 @@ int pkcs7_verify(struct pkcs7_message *pkcs7, /* Authattr presence checked in parser */ break; case VERIFYING_UNSPECIFIED_SIGNATURE: + case VERIFYING_BPF_SIGNATURE: if (pkcs7->data_type != OID_data) { pr_warn("Invalid unspecified sig (not pkcs7-data)\n"); return -EKEYREJECTED; diff --git a/include/linux/verification.h b/include/linux/verification.h index 4f3022d081c3..dec7f2beabfd 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -36,6 +36,7 @@ enum key_being_used_for { VERIFYING_KEY_SIGNATURE, VERIFYING_KEY_SELF_SIGNATURE, VERIFYING_UNSPECIFIED_SIGNATURE, + VERIFYING_BPF_SIGNATURE, NR__KEY_BEING_USED_FOR }; #ifdef CONFIG_SYSTEM_DATA_VERIFICATION diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0987b52d5648..f3b173e48b0f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1611,6 +1611,16 @@ union bpf_attr { * continuous. */ __u32 fd_array_cnt; + /* Pointer to a buffer containing the signature of the BPF + * program. + */ + __aligned_u64 signature; + /* Size of the signature buffer in bytes. */ + __u32 signature_size; + /* ID of the kernel keyring to be used for signature + * verification. 
+ */ + __s32 keyring_id; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ef4ede8bb74f..969f63f8ca28 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3898,7 +3898,7 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, return verify_pkcs7_signature(data, data_len, sig, sig_len, trusted_keyring->key, - VERIFYING_UNSPECIFIED_SIGNATURE, NULL, + VERIFYING_BPF_SIGNATURE, NULL, NULL); #else return -EOPNOTSUPP; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cf7173b1bb83..8a3c3d26f6e2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -2785,8 +2786,44 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } } +static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr, + bool is_kernel) +{ + bpfptr_t usig = make_bpfptr(attr->signature, is_kernel); + struct bpf_dynptr_kern sig_ptr, insns_ptr; + struct bpf_key *key = NULL; + void *sig; + int err = 0; + + if (system_keyring_id_check(attr->keyring_id) == 0) + key = bpf_lookup_system_key(attr->keyring_id); + else + key = bpf_lookup_user_key(attr->keyring_id, 0); + + if (!key) + return -EINVAL; + + sig = kvmemdup_bpfptr(usig, attr->signature_size); + if (IS_ERR(sig)) { + bpf_key_put(key); + return -ENOMEM; + } + + bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0, + attr->signature_size); + bpf_dynptr_init(&insns_ptr, prog->insnsi, BPF_DYNPTR_TYPE_LOCAL, 0, + prog->len * sizeof(struct bpf_insn)); + + err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr, + (struct bpf_dynptr *)&sig_ptr, key); + + bpf_key_put(key); + kvfree(sig); + return err; +} + /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt +#define BPF_PROG_LOAD_LAST_FIELD keyring_id static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { @@ -2950,6 +2987,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) /* eBPF programs must be GPL compatible to use GPL-ed functions */ prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0; + if (attr->signature) { + err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel); + if (err) + goto free_prog; + } + prog->orig_prog = NULL; prog->jited = 0; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0987b52d5648..f3b173e48b0f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1611,6 +1611,16 @@ union bpf_attr { * continuous. */ __u32 fd_array_cnt; + /* Pointer to a buffer containing the signature of the BPF + * program. + */ + __aligned_u64 signature; + /* Size of the signature buffer in bytes. */ + __u32 signature_size; + /* ID of the kernel keyring to be used for signature + * verification. 
+ */ + __s32 keyring_id; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 19ad7bcf0c2f..339b19797237 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -240,7 +240,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, size_t insn_cnt, struct bpf_prog_load_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, fd_array_cnt); + const size_t attr_sz = offsetofend(union bpf_attr, keyring_id); void *finfo = NULL, *linfo = NULL; const char *func_info, *line_info; __u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd; -- cgit v1.2.3 From 39f17c707454290900b608ee5a200b5db9245626 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 14 Sep 2025 13:09:08 +0200 Subject: sched/task.h: fix the wrong comment on task_lock() nesting with tasklist_lock The ancient comment above task_lock() states that it can be nested outside of read_lock(&tasklist_lock), but this is no longer true:

CPU_0                 CPU_1                 CPU_2
task_lock()
                      read_lock(tasklist)
                                            write_lock_irq(tasklist)
read_lock(tasklist)
                      task_lock()

Unless CPU_0 calls read_lock() in IRQ context, queued_read_lock_slowpath() won't get the lock immediately; it will spin waiting for the pending writer on CPU_2, resulting in a deadlock. Link: https://lkml.kernel.org/r/20250914110908.GA18769@redhat.com Signed-off-by: Oleg Nesterov Cc: Christian Brauner Cc: Jiri Slaby Cc: Mateusz Guzik Signed-off-by: Andrew Morton --- include/linux/sched/task.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ea41795a352b..8ff98b18b24b 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -210,9 +210,8 @@ static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) * pins the final release of task.io_context. Also protects ->cpuset and * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * - * Nests both inside and outside of read_lock(&tasklist_lock). - * It must not be nested with write_lock_irq(&tasklist_lock), - * neither inside nor outside. + * Nests inside of read_lock(&tasklist_lock). It must not be nested with + * write_lock_irq(&tasklist_lock), neither inside nor outside. */ static inline void task_lock(struct task_struct *p) { -- cgit v1.2.3 From af6703838ecb1513efdd2502a8f7bb6472c5ce96 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 3 Sep 2025 18:48:41 +0100 Subject: mm: specify separate file and vm_file params in vm_area_desc Patch series "mm: do not assume file == vma->vm_file in compat_vma_mmap_prepare()", v2. As part of the efforts to eliminate the problematic f_op->mmap callback, a new callback - f_op->mmap_prepare - was provided. While we are converting these callbacks, we must deal with 'stacked' filesystems and drivers - those which in their own f_op->mmap callback invoke an inner f_op->mmap callback. To accommodate this, a compatibility layer is provided that, via vfs_mmap(), detects if f_op->mmap_prepare is provided and if so, generates a vm_area_desc containing the VMA's metadata and invokes the call. So far, we have provided desc->file equal to vma->vm_file. However, this is not necessarily valid, especially in the case of stacked drivers which wish to assign a new file after the inner hook is invoked. To account for this, we adjust vm_area_desc to have both file and vm_file fields.
The .vm_file field is strictly set to vma->vm_file (or in the case of a new mapping, what will become vma->vm_file). However, .file is set to whichever file vfs_mmap() is invoked with when using the compatibility layer. Therefore, if the VMA's file needs to be updated in .mmap_prepare, desc->vm_file should be assigned, whilst desc->file should be read. No current f_op->mmap_prepare users assign desc->file, so this is safe to do. This makes the .mmap_prepare callback in the context of a stacked filesystem or driver completely consistent with the existing .mmap implementations. While we're here, we do a few small cleanups, and ensure that we const-ify things correctly in the vm_area_desc struct to avoid hooks accidentally trying to assign fields they should not. This patch (of 2): Stacked filesystems and drivers may invoke mmap hooks with a struct file pointer that differs from the overlying file. We will make this functionality possible in a subsequent patch. In order to prepare for this, let's update vm_area_desc to separately provide desc->file and desc->vm_file parameters. The desc->file parameter is the file that the hook is expected to operate upon, and is not assignable (though the hook may wish to, e.g., update the file's accessed time). The desc->vm_file defaults to what will become vma->vm_file and is what the hook must reassign should it wish to change the VMA's vma->vm_file. For now we keep desc->file and desc->vm_file the same to remain consistent. No f_op->mmap_prepare() callback sets a new vma->vm_file currently, so this is safe to change. While we're here, make the mm_struct pointed at by desc->mm immutable, as well as the desc->mm field itself. As part of this change, also update the single hook which this would otherwise break - mlock_future_ok(), invoked by secretmem_mmap_prepare(). We additionally update set_vma_from_desc() to compare fields in a more logical fashion, checking the (possibly) user-modified fields as the first operand against the existing value as the second one. Additionally, update VMA tests to accommodate changes. Link: https://lkml.kernel.org/r/cover.1756920635.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/3fa15a861bb7419f033d22970598aa61850ea267.1756920635.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Al Viro Cc: Christian Brauner Cc: David Hildenbrand Cc: Jan Kara Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 5 +++-- mm/internal.h | 4 ++-- mm/mmap.c | 2 +- mm/util.c | 14 ++++++++++++-- mm/vma.c | 5 +++-- mm/vma.h | 28 ++++------------------------ tools/testing/vma/vma_internal.h | 28 ++++++++++++++++++---------- 7 files changed, 43 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6920c816f6c6..965dedb3ccfa 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -777,13 +777,14 @@ struct pfnmap_track_ctx { */ struct vm_area_desc { /* Immutable state. */ - struct mm_struct *mm; + const struct mm_struct *const mm; + struct file *const file; /* May vary from vm_file in stacked callers. */ unsigned long start; unsigned long end; /* Mutable fields. Populated with initial state.
*/ pgoff_t pgoff; - struct file *file; + struct file *vm_file; vm_flags_t vm_flags; pgprot_t page_prot; diff --git a/mm/internal.h b/mm/internal.h index c4657ffd342e..63e3ec8d63be 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -962,8 +962,8 @@ extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool write, int *locked); -extern bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, - unsigned long bytes); +bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags, + unsigned long bytes); /* * NOTE: This function can't tell whether the folio is "fully mapped" in the diff --git a/mm/mmap.c b/mm/mmap.c index 7a057e0e8da9..5fd3b80fda1d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -225,7 +225,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } -bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, +bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; diff --git a/mm/util.c b/mm/util.c index 732a2dfcaec7..215ecd0214b7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1161,10 +1161,20 @@ EXPORT_SYMBOL(flush_dcache_folio); */ int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) { - struct vm_area_desc desc; + struct vm_area_desc desc = { + .mm = vma->vm_mm, + .file = vma->vm_file, + .start = vma->vm_start, + .end = vma->vm_end, + + .pgoff = vma->vm_pgoff, + .vm_file = vma->vm_file, + .vm_flags = vma->vm_flags, + .page_prot = vma->vm_page_prot, + }; int err; - err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc)); + err = file->f_op->mmap_prepare(&desc); if (err) return err; set_vma_from_desc(vma, &desc); diff --git a/mm/vma.c b/mm/vma.c index 3b12c7579831..abe0da33c844 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2572,11 +2572,12 @@ static int call_mmap_prepare(struct mmap_state *map) int err; struct vm_area_desc desc = { .mm = map->mm, + .file = map->file, .start = map->addr, .end = map->end, .pgoff = map->pgoff, - .file = map->file, + .vm_file = map->file, .vm_flags = map->vm_flags, .page_prot = map->page_prot, }; @@ -2588,7 +2589,7 @@ static int call_mmap_prepare(struct mmap_state *map) /* Update fields permitted to be changed. */ map->pgoff = desc.pgoff; - map->file = desc.file; + map->file = desc.vm_file; map->vm_flags = desc.vm_flags; map->page_prot = desc.page_prot; /* User-defined fields. */ diff --git a/mm/vma.h b/mm/vma.h index bcdc261c5b15..9183fe549009 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -222,31 +222,11 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, return 0; } - /* - * Temporary helper functions for file systems which wrap an invocation of + * Temporary helper function for stacked mmap handlers which specify * f_op->mmap() but which might have an underlying file system which implements * f_op->mmap_prepare(). 
*/ - -static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma, - struct vm_area_desc *desc) -{ - desc->mm = vma->vm_mm; - desc->start = vma->vm_start; - desc->end = vma->vm_end; - - desc->pgoff = vma->vm_pgoff; - desc->file = vma->vm_file; - desc->vm_flags = vma->vm_flags; - desc->page_prot = vma->vm_page_prot; - - desc->vm_ops = NULL; - desc->private_data = NULL; - - return desc; -} - static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc) { @@ -258,9 +238,9 @@ static inline void set_vma_from_desc(struct vm_area_struct *vma, /* Mutable fields. Populated with initial state. */ vma->vm_pgoff = desc->pgoff; - if (vma->vm_file != desc->file) - vma_set_file(vma, desc->file); - if (vma->vm_flags != desc->vm_flags) + if (desc->vm_file != vma->vm_file) + vma_set_file(vma, desc->vm_file); + if (desc->vm_flags != vma->vm_flags) vm_flags_set(vma, desc->vm_flags); vma->vm_page_prot = desc->page_prot; diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 437d2a1013be..6dcbeaa9f9a0 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -283,13 +283,14 @@ struct vm_area_struct; */ struct vm_area_desc { /* Immutable state. */ - struct mm_struct *mm; + const struct mm_struct *const mm; + struct file *const file; /* May vary from vm_file in stacked callers. */ unsigned long start; unsigned long end; /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; - struct file *file; + struct file *vm_file; vm_flags_t vm_flags; pgprot_t page_prot; @@ -1299,8 +1300,8 @@ static inline bool capable(int cap) return true; } -static inline bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, - unsigned long bytes) +static inline bool mlock_future_ok(const struct mm_struct *mm, + vm_flags_t vm_flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; @@ -1465,16 +1466,23 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc); -static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma, - struct vm_area_desc *desc); - -static int compat_vma_mmap_prepare(struct file *file, +static inline int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) { - struct vm_area_desc desc; + struct vm_area_desc desc = { + .mm = vma->vm_mm, + .file = vma->vm_file, + .start = vma->vm_start, + .end = vma->vm_end, + + .pgoff = vma->vm_pgoff, + .vm_file = vma->vm_file, + .vm_flags = vma->vm_flags, + .page_prot = vma->vm_page_prot, + }; int err; - err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc)); + err = file->f_op->mmap_prepare(&desc); if (err) return err; set_vma_from_desc(vma, &desc); -- cgit v1.2.3 From f7a741c53b712542aedd9382f215fbe969f8a580 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 3 Sep 2025 18:48:42 +0100 Subject: mm: do not assume file == vma->vm_file in compat_vma_mmap_prepare() In commit bb666b7c2707 ("mm: add mmap_prepare() compatibility layer for nested file systems") we introduced the ability for stacked drivers and file systems to correctly invoke the f_op->mmap_prepare() handler from an f_op->mmap() handler via a compatibility layer implemented in compat_vma_mmap_prepare(). This populates vm_area_desc fields according to those found in the (not yet fully initialised) VMA passed to f_op->mmap(). However this function implicitly assumes that the struct file which we are operating upon is equal to vma->vm_file. 
This is not a safe assumption in all cases. The only really sane situation in which this matters would be something like e.g. i915_gem_dmabuf_mmap() which invokes vfs_mmap() against obj->base.filp: ret = vfs_mmap(obj->base.filp, vma); if (ret) return ret; And then sets the VMA's file to this, should the mmap operation succeed: vma_set_file(vma, obj->base.filp); That is - it is the file that is intended to back the VMA mapping. This is not an issue currently, as so far we have only implemented f_op->mmap_prepare() handlers for some file systems and internal mm uses, and the only stacked f_op->mmap() operations that can be performed upon these are those in backing_file_mmap() and coda_file_mmap(), both of which use vma->vm_file. However, moving forward, as we convert drivers to using f_op->mmap_prepare(), this will become a problem. Resolve this issue by explicitly setting desc->file to the provided file parameter and update callers accordingly. Callers are expected to read desc->file and update desc->vm_file - the former will be the file provided by the caller (if stacked, this may differ from vma->vm_file). If the caller needs to differentiate between the two they therefore now can. While we are here, also provide a variant of compat_vma_mmap_prepare() that operates against a pointer to any file_operations struct and does not assume that the file_operations struct we are interested in is file->f_op. This function is __compat_vma_mmap_prepare() and we invoke it from compat_vma_mmap_prepare() so that we share code between the two functions. This is important, because some drivers provide hooks in a separate struct, for instance struct drm_device provides an fops field for this purpose. Also update the VMA selftests accordingly. Link: https://lkml.kernel.org/r/dd0c72df8a33e8ffaa243eeb9b01010b670610e9.1756920635.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Christian Brauner Reviewed-by: Pedro Falcato Reviewed-by: Liam R. Howlett Cc: Al Viro Cc: David Hildenbrand Cc: Jan Kara Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/fs.h | 2 ++ mm/util.c | 62 +++++++++++++++++++++++++--------------- tools/testing/vma/vma_internal.h | 12 ++++++-- 3 files changed, 50 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0783c5d05d3f..594bd4d0521e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2279,6 +2279,8 @@ static inline bool can_mmap_file(struct file *file) return true; } +int __compat_vma_mmap_prepare(const struct file_operations *f_op, + struct file *file, struct vm_area_struct *vma); int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/mm/util.c b/mm/util.c index 215ecd0214b7..6c1d64ed0221 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1133,17 +1133,51 @@ void flush_dcache_folio(struct folio *folio) EXPORT_SYMBOL(flush_dcache_folio); #endif +/** + * __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare() + * for details. This is the same operation, only with a specific file operations + * struct which may or may not be the same as vma->vm_file->f_op. + * @f_op: The file operations whose .mmap_prepare() hook is specified. + * @file: The file which backs or will back the mapping. + * @vma: The VMA to apply the .mmap_prepare() hook to. + * Returns: 0 on success or error. 
+ */ +int __compat_vma_mmap_prepare(const struct file_operations *f_op, + struct file *file, struct vm_area_struct *vma) +{ + struct vm_area_desc desc = { + .mm = vma->vm_mm, + .file = file, + .start = vma->vm_start, + .end = vma->vm_end, + + .pgoff = vma->vm_pgoff, + .vm_file = vma->vm_file, + .vm_flags = vma->vm_flags, + .page_prot = vma->vm_page_prot, + }; + int err; + + err = f_op->mmap_prepare(&desc); + if (err) + return err; + set_vma_from_desc(vma, &desc); + + return 0; +} +EXPORT_SYMBOL(__compat_vma_mmap_prepare); + /** * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an - * existing VMA - * @file: The file which possesss an f_op->mmap_prepare() hook + * existing VMA. + * @file: The file which possesss an f_op->mmap_prepare() hook. * @vma: The VMA to apply the .mmap_prepare() hook to. * * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain - * 'wrapper' file systems invoke a nested mmap hook of an underlying file. + * stacked filesystems invoke a nested mmap hook of an underlying file. * * Until all filesystems are converted to use .mmap_prepare(), we must be - * conservative and continue to invoke these 'wrapper' filesystems using the + * conservative and continue to invoke these stacked filesystems using the * deprecated .mmap() hook. * * However we have a problem if the underlying file system possesses an @@ -1161,25 +1195,7 @@ EXPORT_SYMBOL(flush_dcache_folio); */ int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) { - struct vm_area_desc desc = { - .mm = vma->vm_mm, - .file = vma->vm_file, - .start = vma->vm_start, - .end = vma->vm_end, - - .pgoff = vma->vm_pgoff, - .vm_file = vma->vm_file, - .vm_flags = vma->vm_flags, - .page_prot = vma->vm_page_prot, - }; - int err; - - err = file->f_op->mmap_prepare(&desc); - if (err) - return err; - set_vma_from_desc(vma, &desc); - - return 0; + return __compat_vma_mmap_prepare(file->f_op, file, vma); } EXPORT_SYMBOL(compat_vma_mmap_prepare); diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 6dcbeaa9f9a0..07167446dcf4 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -1466,8 +1466,8 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc); -static inline int compat_vma_mmap_prepare(struct file *file, - struct vm_area_struct *vma) +static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op, + struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { .mm = vma->vm_mm, @@ -1482,7 +1482,7 @@ static inline int compat_vma_mmap_prepare(struct file *file, }; int err; - err = file->f_op->mmap_prepare(&desc); + err = f_op->mmap_prepare(&desc); if (err) return err; set_vma_from_desc(vma, &desc); @@ -1490,6 +1490,12 @@ static inline int compat_vma_mmap_prepare(struct file *file, return 0; } +static inline int compat_vma_mmap_prepare(struct file *file, + struct vm_area_struct *vma) +{ + return __compat_vma_mmap_prepare(file->f_op, file, vma); +} + /* Did the driver provide valid mmap hook configuration? 
*/ static inline bool can_mmap_file(struct file *file) { -- cgit v1.2.3 From ef9f603fd3d4b7937f2cdbce40e47df0a54b2a55 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 22 Sep 2025 11:02:31 -0600 Subject: io_uring/cmd: drop unused res2 param from io_uring_cmd_done() Commit 79525b51acc1 ("io_uring: fix nvme's 32b cqes on mixed cq") split out a separate io_uring_cmd_done32() helper for ->uring_cmd() implementations that return 32-byte CQEs. The res2 value passed to io_uring_cmd_done() is now unused because __io_uring_cmd_done() ignores it when is_cqe32 is passed as false. So drop the parameter from io_uring_cmd_done() to simplify the callers and clarify that it's not possible to return an extra value beyond the 32-bit CQE result. Signed-off-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- block/ioctl.c | 2 +- drivers/block/ublk_drv.c | 6 +++--- fs/btrfs/ioctl.c | 2 +- fs/fuse/dev_uring.c | 8 ++++---- include/linux/io_uring/cmd.h | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/block/ioctl.c b/block/ioctl.c index f7b0006ca45d..c9ea8e53871e 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -776,7 +776,7 @@ static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags) if (bic->res == -EAGAIN && bic->nowait) io_uring_cmd_issue_blocking(cmd); else - io_uring_cmd_done(cmd, bic->res, 0, issue_flags); + io_uring_cmd_done(cmd, bic->res, issue_flags); } static void bio_cmd_bio_end_io(struct bio *bio) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 99abd67b708b..48c409d1e1bb 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1188,7 +1188,7 @@ static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req, struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req); /* tell ublksrv one io request is coming */ - io_uring_cmd_done(cmd, res, 0, issue_flags); + io_uring_cmd_done(cmd, res, issue_flags); } #define UBLK_REQUEUE_DELAY_MS 3 @@ -1805,7 +1805,7 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, spin_unlock(&ubq->cancel_lock); if (!done) - io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags); + io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags); } /* @@ -2452,7 +2452,7 @@ static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd, int ret = ublk_ch_uring_cmd_local(cmd, issue_flags); if (ret != -EIOCBQUEUED) - io_uring_cmd_done(cmd, ret, 0, issue_flags); + io_uring_cmd_done(cmd, ret, issue_flags); } static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7e13de2bdcbf..168d84421a78 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4685,7 +4685,7 @@ out: btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - io_uring_cmd_done(cmd, ret, 0, issue_flags); + io_uring_cmd_done(cmd, ret, issue_flags); add_rchar(current, ret); for (index = 0; index < priv->nr_pages; index++) diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 249b210becb1..a30c44234a4e 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -351,7 +351,7 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) spin_unlock(&queue->lock); if (cmd) - io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED); + io_uring_cmd_done(cmd, -ENOTCONN, IO_URING_F_UNLOCKED); if (req) fuse_uring_stop_fuse_req_end(req); @@ -518,7 +518,7 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd, if 
(need_cmd_done) { /* no queue lock to avoid lock order issues */ - io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); + io_uring_cmd_done(cmd, -ENOTCONN, issue_flags); } } @@ -733,7 +733,7 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, list_move_tail(&ent->list, &queue->ent_in_userspace); spin_unlock(&queue->lock); - io_uring_cmd_done(cmd, 0, 0, issue_flags); + io_uring_cmd_done(cmd, 0, issue_flags); return 0; } @@ -1200,7 +1200,7 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, ent->cmd = NULL; spin_unlock(&queue->lock); - io_uring_cmd_done(cmd, ret, 0, issue_flags); + io_uring_cmd_done(cmd, ret, issue_flags); } /* diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 02d50f08f668..7509025b4071 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -160,9 +160,9 @@ static inline void *io_uring_cmd_ctx_handle(struct io_uring_cmd *cmd) } static inline void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, - u64 res2, unsigned issue_flags) + unsigned issue_flags) { - return __io_uring_cmd_done(ioucmd, ret, res2, issue_flags, false); + return __io_uring_cmd_done(ioucmd, ret, 0, issue_flags, false); } static inline void io_uring_cmd_done32(struct io_uring_cmd *ioucmd, s32 ret, -- cgit v1.2.3 From 7c7da8aa3fd69b77e8efaa14b80bddd948547739 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:37:09 +0900 Subject: can: dev: turn can_set_static_ctrlmode() into a non-inline function can_set_static_ctrlmode() is declared as a static inline. But it is only called in the probe function of the devices and so does not really benefit from any kind of optimization. Transform it into a "normal" function by moving it to drivers/net/can/dev/dev.c Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-can-fix-mtu-v3-2-581bde113f52@kernel.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/dev.c | 21 +++++++++++++++++++++ include/linux/can/dev.h | 23 ++--------------------- 2 files changed, 23 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 99b78cbb2252..02bfed37cc93 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -347,6 +347,27 @@ int can_change_mtu(struct net_device *dev, int new_mtu) } EXPORT_SYMBOL_GPL(can_change_mtu); +/* helper to define static CAN controller features at device creation time */ +int can_set_static_ctrlmode(struct net_device *dev, u32 static_mode) +{ + struct can_priv *priv = netdev_priv(dev); + + /* alloc_candev() succeeded => netdev_priv() is valid at this point */ + if (priv->ctrlmode_supported & static_mode) { + netdev_warn(dev, + "Controller features can not be supported and static at the same time\n"); + return -EINVAL; + } + priv->ctrlmode = static_mode; + + /* override MTU which was set by default in can_setup()? 
*/ + if (static_mode & CAN_CTRLMODE_FD) + dev->mtu = CANFD_MTU; + + return 0; +} +EXPORT_SYMBOL_GPL(can_set_static_ctrlmode); + /* generic implementation of netdev_ops::ndo_eth_ioctl for CAN devices * supporting hardware timestamps */ diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 9a92cbe5b2cb..5dc58360c2d7 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -125,27 +125,6 @@ static inline s32 can_get_relative_tdco(const struct can_priv *priv) return (s32)priv->fd.tdc.tdco - sample_point_in_tc; } -/* helper to define static CAN controller features at device creation time */ -static inline int __must_check can_set_static_ctrlmode(struct net_device *dev, - u32 static_mode) -{ - struct can_priv *priv = netdev_priv(dev); - - /* alloc_candev() succeeded => netdev_priv() is valid at this point */ - if (priv->ctrlmode_supported & static_mode) { - netdev_warn(dev, - "Controller features can not be supported and static at the same time\n"); - return -EINVAL; - } - priv->ctrlmode = static_mode; - - /* override MTU which was set by default in can_setup()? */ - if (static_mode & CAN_CTRLMODE_FD) - dev->mtu = CANFD_MTU; - - return 0; -} - static inline u32 can_get_static_ctrlmode(struct can_priv *priv) { return priv->ctrlmode & ~priv->ctrlmode_supported; @@ -188,6 +167,8 @@ struct can_priv *safe_candev_priv(struct net_device *dev); int open_candev(struct net_device *dev); void close_candev(struct net_device *dev); int can_change_mtu(struct net_device *dev, int new_mtu); +int __must_check can_set_static_ctrlmode(struct net_device *dev, + u32 static_mode); int can_eth_ioctl_hwts(struct net_device *netdev, struct ifreq *ifr, int cmd); int can_ethtool_op_get_ts_info_hwts(struct net_device *dev, struct kernel_ethtool_ts_info *info); -- cgit v1.2.3 From 91daac8a6893c65e18f194946ad3ad9df5e9de8d Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Tue, 16 Sep 2025 08:10:07 +0200 Subject: genirq/msi: Remove msi_post_free() The only user of msi_post_free() - powerpc/pseries - has been changed to use msi_teardown(). Remove this unused callback. Signed-off-by: Nam Cao Reviewed-by: Thomas Gleixner Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250916061007.964005-1-namcao@linutronix.de --- include/linux/msi.h | 4 ---- kernel/irq/msi.c | 3 --- 2 files changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index e5e86a8529fb..faac634ac230 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -431,8 +431,6 @@ struct msi_domain_info; * function. * @domain_free_irqs: Optional function to override the default free * function. - * @msi_post_free: Optional function which is invoked after freeing - * all interrupts. * @msi_translate: Optional translate callback to support the odd wire to * MSI bridges, e.g. 
MBIGEN * @@ -473,8 +471,6 @@ struct msi_domain_ops { struct device *dev, int nvec); void (*domain_free_irqs)(struct irq_domain *domain, struct device *dev); - void (*msi_post_free)(struct irq_domain *domain, - struct device *dev); int (*msi_translate)(struct irq_domain *domain, struct irq_fwspec *fwspec, irq_hw_number_t *hwirq, unsigned int *type); }; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 9b09ad3f9914..e7ad99254841 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1644,9 +1644,6 @@ static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl) else __msi_domain_free_irqs(dev, domain, ctrl); - if (ops->msi_post_free) - ops->msi_post_free(domain, dev); - if (info->flags & MSI_FLAG_FREE_MSI_DESCS) msi_domain_free_descs(dev, ctrl); } -- cgit v1.2.3 From 35758b0032c056cdff3e8f5a70669cb3e2c8d0e4 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:49 +0200 Subject: dibs: Create drivers/dibs Create the file structure for a 'DIBS - Direct Internal Buffer Sharing' shim layer that will provide generic functionality and declarations for dibs device drivers and dibs clients. Following patches will add functionality. Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-4-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- MAINTAINERS | 7 +++++++ drivers/Makefile | 1 + drivers/dibs/Kconfig | 12 ++++++++++++ drivers/dibs/Makefile | 7 +++++++ drivers/dibs/dibs_main.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/dibs.h | 42 ++++++++++++++++++++++++++++++++++++++++++ net/Kconfig | 1 + 7 files changed, 107 insertions(+) create mode 100644 drivers/dibs/Kconfig create mode 100644 drivers/dibs/Makefile create mode 100644 drivers/dibs/dibs_main.c create mode 100644 include/linux/dibs.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index a8a770714101..ecc55fae5f9d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7132,6 +7132,13 @@ L: linux-gpio@vger.kernel.org S: Maintained F: drivers/gpio/gpio-gpio-mm.c +DIBS (DIRECT INTERNAL BUFFER SHARING) +M: Alexandra Winter +L: netdev@vger.kernel.org +S: Supported +F: drivers/dibs/ +F: include/linux/dibs.h + DIGITEQ AUTOMOTIVE MGB4 V4L2 DRIVER M: Martin Tuma L: linux-media@vger.kernel.org diff --git a/drivers/Makefile b/drivers/Makefile index b5749cf67044..a104163b1353 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -195,4 +195,5 @@ obj-$(CONFIG_DRM_ACCEL) += accel/ obj-$(CONFIG_CDX_BUS) += cdx/ obj-$(CONFIG_DPLL) += dpll/ +obj-$(CONFIG_DIBS) += dibs/ obj-$(CONFIG_S390) += s390/ diff --git a/drivers/dibs/Kconfig b/drivers/dibs/Kconfig new file mode 100644 index 000000000000..09c12f6838ad --- /dev/null +++ b/drivers/dibs/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0 +config DIBS + tristate "DIBS support" + default n + help + Direct Internal Buffer Sharing (DIBS) + A communication method that uses common physical (internal) memory + for synchronous direct access into a remote buffer. + + Select this option to provide the abstraction layer between + dibs devices and dibs clients like the SMC protocol. + The module name is dibs. 
diff --git a/drivers/dibs/Makefile b/drivers/dibs/Makefile new file mode 100644 index 000000000000..825dec431bfc --- /dev/null +++ b/drivers/dibs/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# DIBS class module +# + +dibs-y += dibs_main.o +obj-$(CONFIG_DIBS) += dibs.o diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c new file mode 100644 index 000000000000..68e189932fcf --- /dev/null +++ b/drivers/dibs/dibs_main.c @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DIBS - Direct Internal Buffer Sharing + * + * Implementation of the DIBS class module + * + * Copyright IBM Corp. 2025 + */ +#define KMSG_COMPONENT "dibs" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include + +MODULE_DESCRIPTION("Direct Internal Buffer Sharing class"); +MODULE_LICENSE("GPL"); + +/* use an array rather a list for fast mapping: */ +static struct dibs_client *clients[MAX_DIBS_CLIENTS]; +static u8 max_client; + +static int __init dibs_init(void) +{ + memset(clients, 0, sizeof(clients)); + max_client = 0; + + return 0; +} + +static void __exit dibs_exit(void) +{ +} + +module_init(dibs_init); +module_exit(dibs_exit); diff --git a/include/linux/dibs.h b/include/linux/dibs.h new file mode 100644 index 000000000000..3f4175aaa732 --- /dev/null +++ b/include/linux/dibs.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Direct Internal Buffer Sharing + * + * Definitions for the DIBS module + * + * Copyright IBM Corp. 2025 + */ +#ifndef _DIBS_H +#define _DIBS_H + +/* DIBS - Direct Internal Buffer Sharing - concept + * ----------------------------------------------- + * In the case of multiple system sharing the same hardware, dibs fabrics can + * provide dibs devices to these systems. The systems use dibs devices of the + * same fabric to communicate via dmbs (Direct Memory Buffers). Each dmb has + * exactly one owning local dibs device and one remote using dibs device, that + * is authorized to write into this dmb. This access control is provided by the + * dibs fabric. + * + * Because the access to the dmb is based on access to physical memory, it is + * lossless and synchronous. The remote devices can directly access any offset + * of the dmb. + * + * Dibs fabrics, dibs devices and dmbs are identified by tokens and ids. + * Dibs fabric id is unique within the same hardware (with the exception of the + * dibs loopback fabric), dmb token is unique within the same fabric, dibs + * device gids are guaranteed to be unique within the same fabric and + * statistically likely to be globally unique. The exchange of these tokens and + * ids between the systems is not part of the dibs concept. + * + * The dibs layer provides an abstraction between dibs device drivers and dibs + * clients. + */ + +#define MAX_DIBS_CLIENTS 8 + +struct dibs_client { + const char *name; +}; + +#endif /* _DIBS_H */ diff --git a/net/Kconfig b/net/Kconfig index 4b563aea4c23..1d3f757d4b07 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -88,6 +88,7 @@ source "net/tls/Kconfig" source "net/xfrm/Kconfig" source "net/iucv/Kconfig" source "net/smc/Kconfig" +source "drivers/dibs/Kconfig" source "net/xdp/Kconfig" config NET_HANDSHAKE -- cgit v1.2.3 From d324a2ca3f8efd57f5839aa2690554a5cbb3586f Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:50 +0200 Subject: dibs: Register smc as dibs_client Formally register smc as dibs client. 
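Reduced to its essentials, the registration added by this patch looks as follows from a client module's point of view. This is a minimal hypothetical sketch (the "demo" module is illustrative; only struct dibs_client and the dibs_register_client()/dibs_unregister_client() calls below come from this series):

#include <linux/dibs.h>
#include <linux/module.h>

static struct dibs_client demo_client = {
	/* name is used for logging and debugging purposes only */
	.name = "demo",
};

static int __init demo_init(void)
{
	/* takes a free slot in the dibs client array; -ENOSPC when full */
	return dibs_register_client(&demo_client);
}

static void __exit demo_exit(void)
{
	dibs_unregister_client(&demo_client);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
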
Functionality will be moved by follow-on patches from ism_client to dibs_client until eventually ism_client can be removed. As DIBS is only a shim layer without any dependencies, we can make SMC depend on DIBS without adding indirect dependencies. A follow-on patch will remove the dependency of SMC on ISM. Signed-off-by: Alexandra Winter Reviewed-by: Julian Ruess Link: https://patch.msgid.link/20250918110500.1731261-5-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- arch/s390/configs/debug_defconfig | 1 + arch/s390/configs/defconfig | 1 + drivers/dibs/dibs_main.c | 35 +++++++++++++++++++++++++++++++++++ include/linux/dibs.h | 23 +++++++++++++++++++++++ net/smc/Kconfig | 2 +- net/smc/smc_ism.c | 6 ++++++ 6 files changed, 67 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 5e616bc988ac..7bc54f053a3b 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -120,6 +120,7 @@ CONFIG_UNIX=y CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m +CONFIG_DIBS=y CONFIG_SMC_DIAG=m CONFIG_SMC_LO=y CONFIG_INET=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 094599cdaf4d..4bf6f3311f7d 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -111,6 +111,7 @@ CONFIG_UNIX=y CONFIG_UNIX_DIAG=m CONFIG_XFRM_USER=m CONFIG_NET_KEY=m +CONFIG_DIBS=y CONFIG_SMC_DIAG=m CONFIG_SMC_LO=y CONFIG_INET=y diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index 68e189932fcf..a5d2be9c3246 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -20,6 +20,41 @@ MODULE_LICENSE("GPL"); /* use an array rather a list for fast mapping: */ static struct dibs_client *clients[MAX_DIBS_CLIENTS]; static u8 max_client; +static DEFINE_MUTEX(clients_lock); + +int dibs_register_client(struct dibs_client *client) +{ + int i, rc = -ENOSPC; + + mutex_lock(&clients_lock); + for (i = 0; i < MAX_DIBS_CLIENTS; ++i) { + if (!clients[i]) { + clients[i] = client; + client->id = i; + if (i == max_client) + max_client++; + rc = 0; + break; + } + } + mutex_unlock(&clients_lock); + + return rc; +} +EXPORT_SYMBOL_GPL(dibs_register_client); + +int dibs_unregister_client(struct dibs_client *client) +{ + int rc = 0; + + mutex_lock(&clients_lock); + clients[client->id] = NULL; + if (client->id + 1 == max_client) + max_client--; + mutex_unlock(&clients_lock); + return rc; +} +EXPORT_SYMBOL_GPL(dibs_unregister_client); static int __init dibs_init(void) { diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 3f4175aaa732..7bedeaf52c1b 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -33,10 +33,33 @@ * clients. */ +/* DIBS client + * ----------- + */ #define MAX_DIBS_CLIENTS 8 struct dibs_client { + /* client name for logging and debugging purposes */ const char *name; + /* client index - provided and used by dibs layer */ + u8 id; }; +/* Functions to be called by dibs clients: + */ +/** + * dibs_register_client() - register a client with dibs layer + * @client: this client + * + * Return: zero on success. + */ +int dibs_register_client(struct dibs_client *client); +/** + * dibs_unregister_client() - unregister a client with dibs layer + * @client: this client + * + * Return: zero on success. 
+ */ +int dibs_unregister_client(struct dibs_client *client); + #endif /* _DIBS_H */ diff --git a/net/smc/Kconfig b/net/smc/Kconfig index ba5e6a2dd2fd..40dd60c1d23f 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config SMC tristate "SMC socket protocol family" - depends on INET && INFINIBAND + depends on INET && INFINIBAND && DIBS depends on m || ISM != m help SMC-R provides a "sockets over RDMA" solution making use of diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 503a9f93b392..a7a965e3c0ce 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -18,6 +18,7 @@ #include "smc_pnet.h" #include "smc_netlink.h" #include "linux/ism.h" +#include "linux/dibs.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), @@ -42,6 +43,9 @@ static struct ism_client smc_ism_client = { .handle_irq = smcd_handle_irq, }; #endif +static struct dibs_client smc_dibs_client = { + .name = "SMC-D", +}; static void smc_ism_create_system_eid(void) { @@ -623,11 +627,13 @@ int smc_ism_init(void) #if IS_ENABLED(CONFIG_ISM) rc = ism_register_client(&smc_ism_client); #endif + rc = dibs_register_client(&smc_dibs_client); return rc; } void smc_ism_exit(void) { + dibs_unregister_client(&smc_dibs_client); #if IS_ENABLED(CONFIG_ISM) ism_unregister_client(&smc_ism_client); #endif -- cgit v1.2.3 From 269726968f95ebc00e3a47f91eebd6818991d6fa Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:51 +0200 Subject: dibs: Register ism as dibs device Register ism devices with the dibs layer. Follow-on patches will move functionality to the dibs layer. As DIBS is only a shim layer without any dependencies, we can make ISM depend on DIBS without adding indirect dependencies. A follow-on patch will remove the implication of SMC by ISM. Define struct dibs_dev. Follow-on patches will move more content into dibs_dev. The goal of follow-on patches is that ism_dev will only contain fields that are special to this device driver. The same concept will apply to other dibs device drivers. Define dibs_dev_alloc(), dibs_dev_add() and dibs_dev_del() to be called by dibs device drivers, and call them from ism_drv.c. Use ism_dev.dibs for a pointer to dibs_dev. 
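For a dibs device driver, the resulting lifecycle mirrors what the ism_probe()/ism_remove() changes below do. A minimal hypothetical skeleton (struct demo_hw and the probe/remove plumbing are illustrative; only dibs_dev_alloc(), dibs_dev_add() and dibs_dev_del() are the interfaces added here):

#include <linux/dibs.h>
#include <linux/slab.h>

struct demo_hw {
	struct dibs_dev *dibs;	/* back-pointer, like ism_dev.dibs */
};

static int demo_probe(struct demo_hw *hw)
{
	struct dibs_dev *dibs;
	int ret;

	dibs = dibs_dev_alloc();
	if (!dibs)
		return -ENOMEM;
	hw->dibs = dibs;

	/* register the device with the dibs layer */
	ret = dibs_dev_add(dibs);
	if (ret)
		kfree(dibs);	/* pairs with dibs_dev_alloc() */
	return ret;
}

static void demo_remove(struct demo_hw *hw)
{
	dibs_dev_del(hw->dibs);
	/* pairs with dibs_dev_alloc() */
	kfree(hw->dibs);
}
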
Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-6-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_main.c | 38 +++++++++++++++++++++ drivers/s390/net/Kconfig | 2 +- drivers/s390/net/ism.h | 1 + drivers/s390/net/ism_drv.c | 83 +++++++++++++++++++++++++++++----------------- include/linux/dibs.h | 38 +++++++++++++++++++++ include/linux/ism.h | 1 + 6 files changed, 131 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index a5d2be9c3246..2f420e077417 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -21,6 +22,15 @@ MODULE_LICENSE("GPL"); static struct dibs_client *clients[MAX_DIBS_CLIENTS]; static u8 max_client; static DEFINE_MUTEX(clients_lock); +struct dibs_dev_list { + struct list_head list; + struct mutex mutex; /* protects dibs device list */ +}; + +static struct dibs_dev_list dibs_dev_list = { + .list = LIST_HEAD_INIT(dibs_dev_list.list), + .mutex = __MUTEX_INITIALIZER(dibs_dev_list.mutex), +}; int dibs_register_client(struct dibs_client *client) { @@ -56,6 +66,34 @@ int dibs_unregister_client(struct dibs_client *client) } EXPORT_SYMBOL_GPL(dibs_unregister_client); +struct dibs_dev *dibs_dev_alloc(void) +{ + struct dibs_dev *dibs; + + dibs = kzalloc(sizeof(*dibs), GFP_KERNEL); + + return dibs; +} +EXPORT_SYMBOL_GPL(dibs_dev_alloc); + +int dibs_dev_add(struct dibs_dev *dibs) +{ + mutex_lock(&dibs_dev_list.mutex); + list_add(&dibs->list, &dibs_dev_list.list); + mutex_unlock(&dibs_dev_list.mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(dibs_dev_add); + +void dibs_dev_del(struct dibs_dev *dibs) +{ + mutex_lock(&dibs_dev_list.mutex); + list_del_init(&dibs->list); + mutex_unlock(&dibs_dev_list.mutex); +} +EXPORT_SYMBOL_GPL(dibs_dev_del); + static int __init dibs_init(void) { memset(clients, 0, sizeof(clients)); diff --git a/drivers/s390/net/Kconfig b/drivers/s390/net/Kconfig index 2b43f6f28362..92985f595d59 100644 --- a/drivers/s390/net/Kconfig +++ b/drivers/s390/net/Kconfig @@ -81,7 +81,7 @@ config CCWGROUP config ISM tristate "Support for ISM vPCI Adapter" - depends on PCI + depends on PCI && DIBS imply SMC default n help diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h index b5b03db52fce..3078779fa71e 100644 --- a/drivers/s390/net/ism.h +++ b/drivers/s390/net/ism.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 6cd60b174315..8ecd0cccc7e8 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -599,8 +599,39 @@ static void ism_dev_release(struct device *dev) kfree(ism); } +static void ism_dev_exit(struct ism_dev *ism) +{ + struct pci_dev *pdev = ism->pdev; + unsigned long flags; + int i; + + spin_lock_irqsave(&ism->lock, flags); + for (i = 0; i < max_client; ++i) + ism->subs[i] = NULL; + spin_unlock_irqrestore(&ism->lock, flags); + + mutex_lock(&ism_dev_list.mutex); + mutex_lock(&clients_lock); + for (i = 0; i < max_client; ++i) { + if (clients[i]) + clients[i]->remove(ism); + } + mutex_unlock(&clients_lock); + + if (ism_v2_capable) + ism_del_vlan_id(ism, ISM_RESERVED_VLANID); + unregister_ieq(ism); + unregister_sba(ism); + free_irq(pci_irq_vector(pdev, 0), ism); + kfree(ism->sba_client_arr); + pci_free_irq_vectors(pdev); + list_del_init(&ism->list); + mutex_unlock(&ism_dev_list.mutex); +} + static int ism_probe(struct pci_dev *pdev, const 
struct pci_device_id *id) { + struct dibs_dev *dibs; struct ism_dev *ism; int ret; @@ -636,12 +667,28 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) dma_set_max_seg_size(&pdev->dev, SZ_1M); pci_set_master(pdev); + dibs = dibs_dev_alloc(); + if (!dibs) { + ret = -ENOMEM; + goto err_resource; + } + ism->dibs = dibs; + ret = ism_dev_init(ism); if (ret) - goto err_resource; + goto err_dibs; + + ret = dibs_dev_add(dibs); + if (ret) + goto err_ism; return 0; +err_ism: + ism_dev_exit(ism); +err_dibs: + /* pairs with dibs_dev_alloc() */ + kfree(dibs); err_resource: pci_release_mem_regions(pdev); err_disable: @@ -655,41 +702,15 @@ err_dev: return ret; } -static void ism_dev_exit(struct ism_dev *ism) -{ - struct pci_dev *pdev = ism->pdev; - unsigned long flags; - int i; - - spin_lock_irqsave(&ism->lock, flags); - for (i = 0; i < max_client; ++i) - ism->subs[i] = NULL; - spin_unlock_irqrestore(&ism->lock, flags); - - mutex_lock(&ism_dev_list.mutex); - mutex_lock(&clients_lock); - for (i = 0; i < max_client; ++i) { - if (clients[i]) - clients[i]->remove(ism); - } - mutex_unlock(&clients_lock); - - if (ism_v2_capable) - ism_del_vlan_id(ism, ISM_RESERVED_VLANID); - unregister_ieq(ism); - unregister_sba(ism); - free_irq(pci_irq_vector(pdev, 0), ism); - kfree(ism->sba_client_arr); - pci_free_irq_vectors(pdev); - list_del_init(&ism->list); - mutex_unlock(&ism_dev_list.mutex); -} - static void ism_remove(struct pci_dev *pdev) { struct ism_dev *ism = dev_get_drvdata(&pdev->dev); + struct dibs_dev *dibs = ism->dibs; + dibs_dev_del(dibs); ism_dev_exit(ism); + /* pairs with dibs_dev_alloc() */ + kfree(dibs); pci_release_mem_regions(pdev); pci_disable_device(pdev); diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 7bedeaf52c1b..c12db19c98c0 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -9,6 +9,7 @@ #ifndef _DIBS_H #define _DIBS_H +#include /* DIBS - Direct Internal Buffer Sharing - concept * ----------------------------------------------- * In the case of multiple system sharing the same hardware, dibs fabrics can @@ -62,4 +63,41 @@ int dibs_register_client(struct dibs_client *client); */ int dibs_unregister_client(struct dibs_client *client); +/* DIBS devices + * ------------ + */ +struct dibs_dev { + struct list_head list; +}; + +/* ------- End of client-only functions ----------- */ + +/* + * Functions to be called by dibs device drivers: + */ +/** + * dibs_dev_alloc() - allocate and reference device structure + * + * The following fields will be valid upon successful return: dev + * NOTE: Use put_device(dibs_get_dev(@dibs)) to give up your reference instead + * of freeing @dibs @dev directly once you have successfully called this + * function. + * Return: Pointer to dibs device structure + */ +struct dibs_dev *dibs_dev_alloc(void); +/** + * dibs_dev_add() - register with dibs layer and all clients + * @dibs: dibs device + * + * The following fields must be valid upon entry: dev, ops, drv_priv + * All fields will be valid upon successful return. 
+ * Return: zero on success + */ +int dibs_dev_add(struct dibs_dev *dibs); +/** + * dibs_dev_del() - unregister from dibs layer and all clients + * @dibs: dibs device + */ +void dibs_dev_del(struct dibs_dev *dibs); + #endif /* _DIBS_H */ diff --git a/include/linux/ism.h b/include/linux/ism.h index 8358b4cd7ba6..9a53d3c48c16 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -30,6 +30,7 @@ struct ism_dev { spinlock_t lock; /* protects the ism device */ spinlock_t cmd_lock; /* serializes cmds */ struct list_head list; + struct dibs_dev *dibs; struct pci_dev *pdev; struct ism_sba *sba; -- cgit v1.2.3 From 69baaac9361edd169713562f088829a1be9c51a9 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:53 +0200 Subject: dibs: Define dibs_client_ops and dibs_dev_ops Move the device add() and remove() functions from ism_client to dibs_client_ops and call add_dev()/del_dev() for ism devices and dibs_loopback devices. dibs_client_ops->add_dev() = smcd_register_dev() for the smc_dibs_client. This is the first step to handle ism and loopback devices alike (as dibs devices) in the smc dibs client. Define dibs_dev->ops and move smcd_ops->get_chid to dibs_dev_ops->get_fabric_id() for ism and loopback devices. See below for why this needs to be in the same patch as dibs_client_ops->add_dev(). The following changes contain intermediate steps, that will be obsoleted by follow-on patches, once more functionality has been moved to dibs: Use different smcd_ops and max_dmbs for ism and loopback. Follow-on patches will change SMC-D to directly use dibs_ops instead of smcd_ops. In smcd_register_dev() it is now necessary to identify a dibs_loopback device before smcd_dev and smcd_ops->get_chid() are available. So provide dibs_dev_ops->get_fabric_id() in this patch and evaluate it in smc_ism_is_loopback(). Call smc_loopback_init() in smcd_register_dev() and call smc_loopback_exit() in smcd_unregister_dev() to handle the functionality that is still in smc_loopback. Follow-on patches will move all smc_loopback code to dibs_loopback. In smcd_[un]register_dev() use only ism device name, this will be replaced by dibs device name by a follow-on patch. End of changes with intermediate parts. Allocate an smcd event workqueue for all dibs devices, although dibs_loopback does not generate events. Use kernel memory instead of devres memory for smcd_dev and smcd->conn. Since commit a72178cfe855 ("net/smc: Fix dependency of SMC on ISM") an ism device and its driver can have a longer lifetime than the smc module, so smc should not rely on devres to free its resources [1]. It is now the responsibility of the smc client to free smcd and smcd->conn for all dibs devices, ism devices as well as loopback. Call client->ops->del_dev() for all existing dibs devices in dibs_unregister_client(), so all device related structures can be freed in the client. When dibs_unregister_client() is called in the context of smc_exit() or smc_core_reboot_event(), these functions have already called smc_lgrs_shutdown() which calls smc_smcd_terminate_all(smcd) and sets going_away. This is done a second time in smcd_unregister_dev(). This is analogous to how smcr is handled in these functions, by calling first smc_lgrs_shutdown() and then smc_ib_unregister_client() > smc_ib_remove_dev(), so leave it that way. It may be worth investigating, whether smc_lgrs_shutdown() is still required or useful. Remove CONFIG_SMC_LO. CONFIG_DIBS_LO now controls whether a dibs loopback device exists or not. 
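For reference, after this change a dibs client takes roughly the following shape. This is a hypothetical minimal client (the demo_* names are illustrative; struct dibs_client_ops, dibs_set_priv() and dibs_get_priv() are what this patch introduces):

#include <linux/dibs.h>
#include <linux/slab.h>

struct demo_priv {
	unsigned long events;	/* per-device client state */
};

static struct dibs_client demo_client;	/* defined below */

static void demo_add_dev(struct dibs_dev *dev)
{
	struct demo_priv *priv = kzalloc(sizeof(*priv), GFP_KERNEL);

	if (!priv)
		return;
	/* called for all existing devices and any device added later */
	dibs_set_priv(dev, &demo_client, priv);
}

static void demo_del_dev(struct dibs_dev *dev)
{
	struct demo_priv *priv = dibs_get_priv(dev, &demo_client);

	/* the device is no longer usable by this client after return */
	kfree(priv);
}

static const struct dibs_client_ops demo_ops = {
	.add_dev = demo_add_dev,
	.del_dev = demo_del_dev,
};

static struct dibs_client demo_client = {
	.name = "demo",
	.ops = &demo_ops,
};
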
Link: https://www.kernel.org/doc/Documentation/driver-model/devres.txt [1] Signed-off-by: Alexandra Winter Reviewed-by: Mahanta Jambigi Link: https://patch.msgid.link/20250918110500.1731261-8-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- arch/s390/configs/debug_defconfig | 1 - arch/s390/configs/defconfig | 1 - drivers/dibs/dibs_loopback.c | 11 ++++ drivers/dibs/dibs_main.c | 36 +++++++++++ drivers/s390/net/ism_drv.c | 43 ++++++------- include/linux/dibs.h | 89 ++++++++++++++++++++++++- include/linux/ism.h | 2 - include/net/smc.h | 3 +- net/smc/Kconfig | 13 ---- net/smc/Makefile | 2 +- net/smc/af_smc.c | 12 +--- net/smc/smc_ism.c | 132 ++++++++++++++++++++++++++------------ net/smc/smc_ism.h | 7 +- net/smc/smc_loopback.c | 94 ++++----------------------- net/smc/smc_loopback.h | 17 +---- 15 files changed, 270 insertions(+), 193 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 5a2ed07b6198..a97c8d19f643 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -123,7 +123,6 @@ CONFIG_NET_KEY=m CONFIG_DIBS=y CONFIG_DIBS_LO=y CONFIG_SMC_DIAG=m -CONFIG_SMC_LO=y CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 4cbdd7e2ff9f..7f7b52d9a33c 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -114,7 +114,6 @@ CONFIG_NET_KEY=m CONFIG_DIBS=y CONFIG_DIBS_LO=y CONFIG_SMC_DIAG=m -CONFIG_SMC_LO=y CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index 225514a452a8..215986ae54a4 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -18,6 +18,15 @@ /* global loopback device */ static struct dibs_lo_dev *lo_dev; +static u16 dibs_lo_get_fabric_id(struct dibs_dev *dibs) +{ + return DIBS_LOOPBACK_FABRIC; +} + +static const struct dibs_dev_ops dibs_lo_ops = { + .get_fabric_id = dibs_lo_get_fabric_id, +}; + static void dibs_lo_dev_exit(struct dibs_lo_dev *ldev) { dibs_dev_del(ldev->dibs); @@ -40,6 +49,8 @@ static int dibs_lo_dev_probe(void) } ldev->dibs = dibs; + dibs->drv_priv = ldev; + dibs->ops = &dibs_lo_ops; ret = dibs_dev_add(dibs); if (ret) diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index a7e33be36158..f1cfa5849277 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -36,8 +36,10 @@ static struct dibs_dev_list dibs_dev_list = { int dibs_register_client(struct dibs_client *client) { + struct dibs_dev *dibs; int i, rc = -ENOSPC; + mutex_lock(&dibs_dev_list.mutex); mutex_lock(&clients_lock); for (i = 0; i < MAX_DIBS_CLIENTS; ++i) { if (!clients[i]) { @@ -51,19 +53,37 @@ int dibs_register_client(struct dibs_client *client) } mutex_unlock(&clients_lock); + if (i < MAX_DIBS_CLIENTS) { + /* initialize with all devices that we got so far */ + list_for_each_entry(dibs, &dibs_dev_list.list, list) { + dibs->priv[i] = NULL; + client->ops->add_dev(dibs); + } + } + mutex_unlock(&dibs_dev_list.mutex); + return rc; } EXPORT_SYMBOL_GPL(dibs_register_client); int dibs_unregister_client(struct dibs_client *client) { + struct dibs_dev *dibs; int rc = 0; + mutex_lock(&dibs_dev_list.mutex); + list_for_each_entry(dibs, &dibs_dev_list.list, list) { + clients[client->id]->ops->del_dev(dibs); + dibs->priv[client->id] = NULL; + } + mutex_lock(&clients_lock); clients[client->id] = NULL; if (client->id + 1 == max_client) max_client--; 
mutex_unlock(&clients_lock); + + mutex_unlock(&dibs_dev_list.mutex); return rc; } EXPORT_SYMBOL_GPL(dibs_unregister_client); @@ -80,7 +100,15 @@ EXPORT_SYMBOL_GPL(dibs_dev_alloc); int dibs_dev_add(struct dibs_dev *dibs) { + int i; + mutex_lock(&dibs_dev_list.mutex); + mutex_lock(&clients_lock); + for (i = 0; i < max_client; ++i) { + if (clients[i]) + clients[i]->ops->add_dev(dibs); + } + mutex_unlock(&clients_lock); list_add(&dibs->list, &dibs_dev_list.list); mutex_unlock(&dibs_dev_list.mutex); @@ -90,7 +118,15 @@ EXPORT_SYMBOL_GPL(dibs_dev_add); void dibs_dev_del(struct dibs_dev *dibs) { + int i; + mutex_lock(&dibs_dev_list.mutex); + mutex_lock(&clients_lock); + for (i = 0; i < max_client; ++i) { + if (clients[i]) + clients[i]->ops->del_dev(dibs); + } + mutex_unlock(&clients_lock); list_del_init(&dibs->list); mutex_unlock(&dibs_dev_list.mutex); } diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 8ecd0cccc7e8..2bd8f64ebb56 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -79,7 +79,6 @@ int ism_register_client(struct ism_client *client) /* initialize with all devices that we got so far */ list_for_each_entry(ism, &ism_dev_list.list, list) { ism->priv[i] = NULL; - client->add(ism); ism_setup_forwarding(client, ism); } } @@ -465,6 +464,16 @@ int ism_move(struct ism_dev *ism, u64 dmb_tok, unsigned int idx, bool sf, } EXPORT_SYMBOL_GPL(ism_move); +static u16 ism_get_chid(struct dibs_dev *dibs) +{ + struct ism_dev *ism = dibs->drv_priv; + + if (!ism || !ism->pdev) + return 0; + + return to_zpci(ism->pdev)->pchid; +} + static void ism_handle_event(struct ism_dev *ism) { struct ism_event *entry; @@ -523,6 +532,10 @@ static irqreturn_t ism_handle_irq(int irq, void *data) return IRQ_HANDLED; } +static const struct dibs_dev_ops ism_ops = { + .get_fabric_id = ism_get_chid, +}; + static int ism_dev_init(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; @@ -564,7 +577,6 @@ static int ism_dev_init(struct ism_dev *ism) mutex_lock(&clients_lock); for (i = 0; i < max_client; ++i) { if (clients[i]) { - clients[i]->add(ism); ism_setup_forwarding(clients[i], ism); } } @@ -611,12 +623,6 @@ static void ism_dev_exit(struct ism_dev *ism) spin_unlock_irqrestore(&ism->lock, flags); mutex_lock(&ism_dev_list.mutex); - mutex_lock(&clients_lock); - for (i = 0; i < max_client; ++i) { - if (clients[i]) - clients[i]->remove(ism); - } - mutex_unlock(&clients_lock); if (ism_v2_capable) ism_del_vlan_id(ism, ISM_RESERVED_VLANID); @@ -672,7 +678,10 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) ret = -ENOMEM; goto err_resource; } + /* set this up before we enable interrupts */ ism->dibs = dibs; + dibs->drv_priv = ism; + dibs->ops = &ism_ops; ret = ism_dev_init(ism); if (ret) @@ -857,19 +866,6 @@ static void smcd_get_local_gid(struct smcd_dev *smcd, smcd_gid->gid_ext = 0; } -static u16 ism_get_chid(struct ism_dev *ism) -{ - if (!ism || !ism->pdev) - return 0; - - return to_zpci(ism->pdev)->pchid; -} - -static u16 smcd_get_chid(struct smcd_dev *smcd) -{ - return ism_get_chid(smcd->priv); -} - static inline struct device *smcd_get_dev(struct smcd_dev *dev) { struct ism_dev *ism = dev->priv; @@ -877,7 +873,7 @@ static inline struct device *smcd_get_dev(struct smcd_dev *dev) return &ism->dev; } -static const struct smcd_ops ism_ops = { +static const struct smcd_ops ism_smcd_ops = { .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, .unregister_dmb = smcd_unregister_dmb, @@ -889,13 +885,12 @@ static const struct smcd_ops 
ism_ops = { .move_data = smcd_move, .supports_v2 = smcd_supports_v2, .get_local_gid = smcd_get_local_gid, - .get_chid = smcd_get_chid, .get_dev = smcd_get_dev, }; const struct smcd_ops *ism_get_smcd_ops(void) { - return &ism_ops; + return &ism_smcd_ops; } EXPORT_SYMBOL_GPL(ism_get_smcd_ops); #endif diff --git a/include/linux/dibs.h b/include/linux/dibs.h index c12db19c98c0..805ab33271b5 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -34,14 +34,45 @@ * clients. */ +struct dibs_dev; + /* DIBS client * ----------- */ #define MAX_DIBS_CLIENTS 8 +/* All dibs clients have access to all dibs devices. + * A dibs client provides the following functions to be called by dibs layer or + * dibs device drivers: + */ +struct dibs_client_ops { + /** + * add_dev() - add a dibs device + * @dev: device that was added + * + * Will be called during dibs_register_client() for all existing + * dibs devices and whenever a new dibs device is registered. + * dev is usable until dibs_client.remove() is called. + * *dev is protected by device refcounting. + */ + void (*add_dev)(struct dibs_dev *dev); + /** + * del_dev() - remove a dibs device + * @dev: device to be removed + * + * Will be called whenever a dibs device is removed. + * Will be called during dibs_unregister_client() for all existing + * dibs devices and whenever a dibs device is unregistered. + * The device has already stopped initiative for this client: + * No new handlers will be started. + * The device is no longer usable by this client after this call. + */ + void (*del_dev)(struct dibs_dev *dev); +}; struct dibs_client { /* client name for logging and debugging purposes */ const char *name; + const struct dibs_client_ops *ops; /* client index - provided and used by dibs layer */ u8 id; }; @@ -52,6 +83,7 @@ struct dibs_client { * dibs_register_client() - register a client with dibs layer * @client: this client * + * Will call client->ops->add_dev() for all existing dibs devices. * Return: zero on success. */ int dibs_register_client(struct dibs_client *client); @@ -59,21 +91,74 @@ int dibs_register_client(struct dibs_client *client); * dibs_unregister_client() - unregister a client with dibs layer * @client: this client * + * Will call client->ops->del_dev() for all existing dibs devices. * Return: zero on success. */ int dibs_unregister_client(struct dibs_client *client); +/* dibs clients can call dibs device ops. */ + /* DIBS devices * ------------ */ + +/* Defined fabric id / CHID for all loopback devices: + * All dibs loopback devices report this fabric id. In this case devices with + * the same fabric id can NOT communicate via dibs. Only loopback devices with + * the same dibs device gid can communicate (=same device with itself). + */ +#define DIBS_LOOPBACK_FABRIC 0xFFFF + +/* A dibs device provides the following functions to be called by dibs clients. + * They are mandatory, unless marked 'optional'. + */ +struct dibs_dev_ops { + /** + * get_fabric_id() + * @dev: local dibs device + * + * Only devices on the same dibs fabric can communicate. Fabric_id is + * unique inside the same HW system. Use fabric_id for fast negative + * checks, but only query_remote_gid() can give a reliable positive + * answer: + * Different fabric_id: dibs is not possible + * Same fabric_id: dibs may be possible or not + * (e.g. different HW systems) + * EXCEPTION: DIBS_LOOPBACK_FABRIC denotes an ism_loopback device + * that can only communicate with itself. 
Use dibs_dev.gid + * or query_remote_gid()to determine whether sender and + * receiver use the same ism_loopback device. + * Return: 2 byte dibs fabric id + */ + u16 (*get_fabric_id)(struct dibs_dev *dev); +}; + struct dibs_dev { struct list_head list; + /* To be filled by device driver, before calling dibs_dev_add(): */ + const struct dibs_dev_ops *ops; + /* priv pointer for device driver */ + void *drv_priv; + + /* priv pointer per client; for client usage only */ + void *priv[MAX_DIBS_CLIENTS]; }; +static inline void dibs_set_priv(struct dibs_dev *dev, + struct dibs_client *client, void *priv) +{ + dev->priv[client->id] = priv; +} + +static inline void *dibs_get_priv(struct dibs_dev *dev, + struct dibs_client *client) +{ + return dev->priv[client->id]; +} + /* ------- End of client-only functions ----------- */ -/* - * Functions to be called by dibs device drivers: +/* Functions to be called by dibs device drivers: */ /** * dibs_dev_alloc() - allocate and reference device structure diff --git a/include/linux/ism.h b/include/linux/ism.h index 9a53d3c48c16..c818a25996db 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -59,8 +59,6 @@ struct ism_event { struct ism_client { const char *name; - void (*add)(struct ism_dev *dev); - void (*remove)(struct ism_dev *dev); void (*handle_event)(struct ism_dev *dev, struct ism_event *event); /* Parameter dmbemask contains a bit vector with updated DMBEs, if sent * via ism_move_data(). Callback function must handle all active bits diff --git a/include/net/smc.h b/include/net/smc.h index a9c023dd1380..e271891b85e6 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -15,6 +15,7 @@ #include #include #include +#include #include "linux/ism.h" struct sock; @@ -62,7 +63,6 @@ struct smcd_ops { unsigned int size); int (*supports_v2)(void); void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); - u16 (*get_chid)(struct smcd_dev *dev); struct device* (*get_dev)(struct smcd_dev *dev); /* optional operations */ @@ -81,6 +81,7 @@ struct smcd_dev { const struct smcd_ops *ops; void *priv; void *client; + struct dibs_dev *dibs; struct list_head list; spinlock_t lock; struct smc_connection **conn; diff --git a/net/smc/Kconfig b/net/smc/Kconfig index 40dd60c1d23f..9535d88c2acb 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -20,16 +20,3 @@ config SMC_DIAG smcss. if unsure, say Y. - -config SMC_LO - bool "SMC intra-OS shortcut with loopback-ism" - depends on SMC - default n - help - SMC_LO enables the creation of an Emulated-ISM device named - loopback-ism in SMC and makes use of it for transferring data - when communication occurs within the same OS. This helps in - convenient testing of SMC-D since loopback-ism is independent - of architecture or hardware. - - if unsure, say N. 
diff --git a/net/smc/Makefile b/net/smc/Makefile index 60f1c87d5212..96ccfdf246df 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -6,4 +6,4 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o smc-y += smc_tracepoint.o smc_inet.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o -smc-$(CONFIG_SMC_LO) += smc_loopback.o +smc-y += smc_loopback.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9097e4f24d2b..77b99e8ef35a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -57,7 +57,6 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" -#include "smc_loopback.h" #include "smc_inet.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group @@ -3591,16 +3590,10 @@ static int __init smc_init(void) goto out_sock; } - rc = smc_loopback_init(); - if (rc) { - pr_err("%s: smc_loopback_init fails with %d\n", __func__, rc); - goto out_ib; - } - rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_lo; + goto out_ib; } rc = smc_inet_init(); if (rc) { @@ -3611,8 +3604,6 @@ static int __init smc_init(void) return 0; out_ulp: tcp_unregister_ulp(&smc_ulp_ops); -out_lo: - smc_loopback_exit(); out_ib: smc_ib_unregister_client(); out_sock: @@ -3651,7 +3642,6 @@ static void __exit smc_exit(void) tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); - smc_loopback_exit(); smc_ib_unregister_client(); smc_ism_exit(); destroy_workqueue(smc_close_wq); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index a7a965e3c0ce..415f03910c91 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -15,6 +15,7 @@ #include "smc.h" #include "smc_core.h" #include "smc_ism.h" +#include "smc_loopback.h" #include "smc_pnet.h" #include "smc_netlink.h" #include "linux/ism.h" @@ -28,23 +29,27 @@ struct smcd_dev_list smcd_dev_list = { static bool smc_ism_v2_capable; static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; +static void smcd_register_dev(struct dibs_dev *dibs); +static void smcd_unregister_dev(struct dibs_dev *dibs); #if IS_ENABLED(CONFIG_ISM) -static void smcd_register_dev(struct ism_dev *ism); -static void smcd_unregister_dev(struct ism_dev *ism); static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, u16 dmbemask); static struct ism_client smc_ism_client = { .name = "SMC-D", - .add = smcd_register_dev, - .remove = smcd_unregister_dev, .handle_event = smcd_handle_event, .handle_irq = smcd_handle_irq, }; #endif +static struct dibs_client_ops smc_client_ops = { + .add_dev = smcd_register_dev, + .del_dev = smcd_unregister_dev, +}; + static struct dibs_client smc_dibs_client = { .name = "SMC-D", + .ops = &smc_client_ops, }; static void smc_ism_create_system_eid(void) @@ -86,7 +91,7 @@ void smc_ism_get_system_eid(u8 **eid) u16 smc_ism_get_chid(struct smcd_dev *smcd) { - return smcd->ops->get_chid(smcd); + return smcd->dibs->ops->get_fabric_id(smcd->dibs); } /* HW supports ISM V2 and thus System EID is defined */ @@ -318,7 +323,7 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); - smc_set_pci_values(to_pci_dev(ism->dev.parent), &smc_pci_dev); + smc_set_pci_values(ism->pdev, &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if 
(nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) @@ -368,7 +373,7 @@ static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, list_for_each_entry(smcd, &dev_list->list, list) { if (num < snum) goto next; - if (smc_ism_is_loopback(smcd)) + if (smc_ism_is_loopback(smcd->dibs)) goto next; if (smc_nl_handle_smcd_dev(smcd, skb, cb)) goto errout; @@ -453,24 +458,26 @@ static void smc_ism_event_work(struct work_struct *work) } kfree(wrk); } +#endif -static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, - const struct smcd_ops *ops, int max_dmbs) +static struct smcd_dev *smcd_alloc_dev(const char *name, + const struct smcd_ops *ops, + int max_dmbs) { struct smcd_dev *smcd; - smcd = devm_kzalloc(parent, sizeof(*smcd), GFP_KERNEL); + smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); if (!smcd) return NULL; - smcd->conn = devm_kcalloc(parent, max_dmbs, - sizeof(struct smc_connection *), GFP_KERNEL); + smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), + GFP_KERNEL); if (!smcd->conn) - return NULL; + goto free_smcd; smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); if (!smcd->event_wq) - return NULL; + goto free_conn; smcd->ops = ops; @@ -480,27 +487,58 @@ static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, INIT_LIST_HEAD(&smcd->lgr_list); init_waitqueue_head(&smcd->lgrs_deleted); return smcd; + +free_conn: + kfree(smcd->conn); +free_smcd: + kfree(smcd); + return NULL; } -static void smcd_register_dev(struct ism_dev *ism) +static void smcd_register_dev(struct dibs_dev *dibs) { - const struct smcd_ops *ops = ism_get_smcd_ops(); struct smcd_dev *smcd, *fentry; + const struct smcd_ops *ops; + struct smc_lo_dev *smc_lo; + struct ism_dev *ism; - if (!ops) - return; + if (smc_ism_is_loopback(dibs)) { + if (smc_loopback_init(&smc_lo)) + return; + } - smcd = smcd_alloc_dev(&ism->pdev->dev, dev_name(&ism->pdev->dev), ops, - ISM_NR_DMBS); + if (smc_ism_is_loopback(dibs)) { + ops = smc_lo_get_smcd_ops(); + smcd = smcd_alloc_dev(dev_name(&smc_lo->dev), ops, + SMC_LO_MAX_DMBS); + } else { + ism = dibs->drv_priv; +#if IS_ENABLED(CONFIG_ISM) + ops = ism_get_smcd_ops(); +#endif + smcd = smcd_alloc_dev(dev_name(&ism->pdev->dev), ops, + ISM_NR_DMBS); + } if (!smcd) return; - smcd->priv = ism; - smcd->client = &smc_ism_client; - ism_set_priv(ism, &smc_ism_client, smcd); - if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) - smc_pnetid_by_table_smcd(smcd); - if (smcd->ops->supports_v2()) + smcd->dibs = dibs; + dibs_set_priv(dibs, &smc_dibs_client, smcd); + + if (smc_ism_is_loopback(dibs)) { + smcd->priv = smc_lo; + smc_lo->smcd = smcd; + } else { + smcd->priv = ism; +#if IS_ENABLED(CONFIG_ISM) + ism_set_priv(ism, &smc_ism_client, smcd); + smcd->client = &smc_ism_client; +#endif + if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); + } + + if (smc_ism_is_loopback(dibs) || smcd->ops->supports_v2()) smc_ism_set_v2_capable(); mutex_lock(&smcd_dev_list.mutex); /* sort list: @@ -510,7 +548,7 @@ static void smcd_register_dev(struct ism_dev *ism) if (!smcd->pnetid[0]) { fentry = list_first_entry_or_null(&smcd_dev_list.list, struct smcd_dev, list); - if (fentry && smc_ism_is_loopback(fentry)) + if (fentry && smc_ism_is_loopback(fentry->dibs)) list_add(&smcd->list, &fentry->list); else list_add(&smcd->list, &smcd_dev_list.list); @@ -519,32 +557,46 @@ static void smcd_register_dev(struct ism_dev *ism) } mutex_unlock(&smcd_dev_list.mutex); - if 
(smc_pnet_is_pnetid_set(smcd->pnetid)) - pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", - dev_name(&ism->dev), smcd->pnetid, - smcd->pnetid_by_user ? - " (user defined)" : - ""); - else - pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", - dev_name(&ism->dev)); + if (smc_ism_is_loopback(dibs)) { + pr_warn_ratelimited("smc: adding smcd loopback device\n"); + } else { + if (smc_pnet_is_pnetid_set(smcd->pnetid)) + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&ism->dev), smcd->pnetid, + smcd->pnetid_by_user ? + " (user defined)" : + ""); + else + pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", + dev_name(&ism->dev)); + } return; } -static void smcd_unregister_dev(struct ism_dev *ism) +static void smcd_unregister_dev(struct dibs_dev *dibs) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); + struct ism_dev *ism = dibs->drv_priv; - pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&ism->dev)); + if (smc_ism_is_loopback(dibs)) { + pr_warn_ratelimited("smc: removing smcd loopback device\n"); + } else { + pr_warn_ratelimited("smc: removing smcd device %s\n", + dev_name(&ism->dev)); + } smcd->going_away = 1; smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); destroy_workqueue(smcd->event_wq); + if (smc_ism_is_loopback(dibs)) + smc_loopback_exit(); + kfree(smcd->conn); + kfree(smcd); } +#if IS_ENABLED(CONFIG_ISM) /* SMCD Device event handler. Called from ISM device interrupt handler. * Parameters are ism device pointer, * - event->type (0 --> DMB, 1 --> GID), diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 765aa8fae6fa..04699951d03f 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -12,6 +12,7 @@ #include #include #include +#include #include "smc.h" @@ -85,14 +86,14 @@ static inline bool __smc_ism_is_emulated(u16 chid) static inline bool smc_ism_is_emulated(struct smcd_dev *smcd) { - u16 chid = smcd->ops->get_chid(smcd); + u16 chid = smcd->dibs->ops->get_fabric_id(smcd->dibs); return __smc_ism_is_emulated(chid); } -static inline bool smc_ism_is_loopback(struct smcd_dev *smcd) +static inline bool smc_ism_is_loopback(struct dibs_dev *dibs) { - return (smcd->ops->get_chid(smcd) == 0xFFFF); + return (dibs->ops->get_fabric_id(dibs) == DIBS_LOOPBACK_FABRIC); } #endif diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 1853c26fbbbb..37d8366419f7 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -35,8 +35,6 @@ static void smc_lo_generate_ids(struct smc_lo_dev *ldev) memcpy(&lgid->gid, &uuid, sizeof(lgid->gid)); memcpy(&lgid->gid_ext, (u8 *)&uuid + sizeof(lgid->gid), sizeof(lgid->gid_ext)); - - ldev->chid = SMC_LO_RESERVED_CHID; } static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, @@ -257,11 +255,6 @@ static void smc_lo_get_local_gid(struct smcd_dev *smcd, smcd_gid->gid_ext = ldev->local_gid.gid_ext; } -static u16 smc_lo_get_chid(struct smcd_dev *smcd) -{ - return ((struct smc_lo_dev *)smcd->priv)->chid; -} - static struct device *smc_lo_get_dev(struct smcd_dev *smcd) { return &((struct smc_lo_dev *)smcd->priv)->dev; @@ -281,72 +274,15 @@ static const struct smcd_ops lo_ops = { .signal_event = NULL, .move_data = smc_lo_move_data, .get_local_gid = smc_lo_get_local_gid, - .get_chid = smc_lo_get_chid, .get_dev = smc_lo_get_dev, }; -static struct smcd_dev 
*smcd_lo_alloc_dev(const struct smcd_ops *ops, - int max_dmbs) -{ - struct smcd_dev *smcd; - - smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); - if (!smcd) - return NULL; - - smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), - GFP_KERNEL); - if (!smcd->conn) - goto out_smcd; - - smcd->ops = ops; - - spin_lock_init(&smcd->lock); - spin_lock_init(&smcd->lgr_lock); - INIT_LIST_HEAD(&smcd->vlan); - INIT_LIST_HEAD(&smcd->lgr_list); - init_waitqueue_head(&smcd->lgrs_deleted); - return smcd; - -out_smcd: - kfree(smcd); - return NULL; -} - -static int smcd_lo_register_dev(struct smc_lo_dev *ldev) -{ - struct smcd_dev *smcd; - - smcd = smcd_lo_alloc_dev(&lo_ops, SMC_LO_MAX_DMBS); - if (!smcd) - return -ENOMEM; - ldev->smcd = smcd; - smcd->priv = ldev; - smc_ism_set_v2_capable(); - mutex_lock(&smcd_dev_list.mutex); - list_add(&smcd->list, &smcd_dev_list.list); - mutex_unlock(&smcd_dev_list.mutex); - pr_warn_ratelimited("smc: adding smcd device %s\n", - dev_name(&ldev->dev)); - return 0; -} - -static void smcd_lo_unregister_dev(struct smc_lo_dev *ldev) +const struct smcd_ops *smc_lo_get_smcd_ops(void) { - struct smcd_dev *smcd = ldev->smcd; - - pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&ldev->dev)); - smcd->going_away = 1; - smc_smcd_terminate_all(smcd); - mutex_lock(&smcd_dev_list.mutex); - list_del_init(&smcd->list); - mutex_unlock(&smcd_dev_list.mutex); - kfree(smcd->conn); - kfree(smcd); + return &lo_ops; } -static int smc_lo_dev_init(struct smc_lo_dev *ldev) +static void smc_lo_dev_init(struct smc_lo_dev *ldev) { smc_lo_generate_ids(ldev); rwlock_init(&ldev->dmb_ht_lock); @@ -354,12 +290,11 @@ static int smc_lo_dev_init(struct smc_lo_dev *ldev) atomic_set(&ldev->dmb_cnt, 0); init_waitqueue_head(&ldev->ldev_release); - return smcd_lo_register_dev(ldev); + return; } static void smc_lo_dev_exit(struct smc_lo_dev *ldev) { - smcd_lo_unregister_dev(ldev); if (atomic_read(&ldev->dmb_cnt)) wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); } @@ -375,7 +310,6 @@ static void smc_lo_dev_release(struct device *dev) static int smc_lo_dev_probe(void) { struct smc_lo_dev *ldev; - int ret; ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); if (!ldev) @@ -385,17 +319,11 @@ static int smc_lo_dev_probe(void) ldev->dev.release = smc_lo_dev_release; device_initialize(&ldev->dev); dev_set_name(&ldev->dev, smc_lo_dev_name); - - ret = smc_lo_dev_init(ldev); - if (ret) - goto free_dev; + smc_lo_dev_init(ldev); lo_dev = ldev; /* global loopback device */ - return 0; -free_dev: - put_device(&ldev->dev); - return ret; + return 0; } static void smc_lo_dev_remove(void) @@ -405,11 +333,17 @@ static void smc_lo_dev_remove(void) smc_lo_dev_exit(lo_dev); put_device(&lo_dev->dev); /* device_initialize in smc_lo_dev_probe */ + lo_dev = NULL; } -int smc_loopback_init(void) +int smc_loopback_init(struct smc_lo_dev **smc_lb) { - return smc_lo_dev_probe(); + int ret; + + ret = smc_lo_dev_probe(); + if (!ret) + *smc_lb = lo_dev; + return ret; } void smc_loopback_exit(void) diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h index 04dc6808d2e1..76c62526e2e5 100644 --- a/net/smc/smc_loopback.h +++ b/net/smc/smc_loopback.h @@ -17,10 +17,8 @@ #include #include -#if IS_ENABLED(CONFIG_SMC_LO) #define SMC_LO_MAX_DMBS 5000 #define SMC_LO_DMBS_HASH_BITS 12 -#define SMC_LO_RESERVED_CHID 0xFFFF struct smc_lo_dmb_node { struct hlist_node list; @@ -35,7 +33,6 @@ struct smc_lo_dmb_node { struct smc_lo_dev { struct smcd_dev *smcd; struct device dev; - u16 chid; struct smcd_gid local_gid; atomic_t dmb_cnt; 
rwlock_t dmb_ht_lock; @@ -44,17 +41,9 @@ struct smc_lo_dev { wait_queue_head_t ldev_release; }; -int smc_loopback_init(void); +const struct smcd_ops *smc_lo_get_smcd_ops(void); + +int smc_loopback_init(struct smc_lo_dev **smc_lb); void smc_loopback_exit(void); -#else -static inline int smc_loopback_init(void) -{ - return 0; -} - -static inline void smc_loopback_exit(void) -{ -} -#endif #endif /* _SMC_LOOPBACK_H */ -- cgit v1.2.3 From 845c334a0186a23c2ac4abfb444e499fec831b24 Mon Sep 17 00:00:00 2001 From: Julian Ruess Date: Thu, 18 Sep 2025 13:04:54 +0200 Subject: dibs: Move struct device to dibs_dev Move struct device from ism_dev and smc_lo_dev to dibs_dev, and define a corresponding release function. Free ism_dev in ism_remove() and smc_lo_dev in smc_lo_dev_remove(). Replace smcd->ops->get_dev(smcd) by using dibs->dev directly. An alternative design would be to embed dibs_dev as a field in ism_dev and do the same for other dibs device driver specific structs. However that would have the disadvantage that each dibs device driver needs to allocate dibs_dev and each dibs device driver needs a different device release function. The advantage would be that ism_dev and other device driver specific structs would be covered by device reference counts. Signed-off-by: Julian Ruess Co-developed-by: Alexandra Winter Signed-off-by: Alexandra Winter Reviewed-by: Mahanta Jambigi Link: https://patch.msgid.link/20250918110500.1731261-9-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_loopback.c | 15 +++++++-------- drivers/dibs/dibs_main.c | 21 +++++++++++++++++++- drivers/s390/net/ism_drv.c | 40 ++++++++------------------------------ include/linux/dibs.h | 1 + include/linux/ism.h | 1 - include/net/smc.h | 1 - net/smc/smc_core.c | 4 ++-- net/smc/smc_ism.c | 46 ++++++++++++++++++-------------------------- net/smc/smc_loopback.c | 21 +------------------- net/smc/smc_loopback.h | 1 - net/smc/smc_pnet.c | 13 +++++-------- 11 files changed, 63 insertions(+), 101 deletions(-) (limited to 'include/linux') diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index 215986ae54a4..76e479d5724b 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -15,6 +15,7 @@ #include "dibs_loopback.h" +static const char dibs_lo_dev_name[] = "lo"; /* global loopback device */ static struct dibs_lo_dev *lo_dev; @@ -27,11 +28,6 @@ static const struct dibs_dev_ops dibs_lo_ops = { .get_fabric_id = dibs_lo_get_fabric_id, }; -static void dibs_lo_dev_exit(struct dibs_lo_dev *ldev) -{ - dibs_dev_del(ldev->dibs); -} - static int dibs_lo_dev_probe(void) { struct dibs_lo_dev *ldev; @@ -52,6 +48,9 @@ static int dibs_lo_dev_probe(void) dibs->drv_priv = ldev; dibs->ops = &dibs_lo_ops; + dibs->dev.parent = NULL; + dev_set_name(&dibs->dev, "%s", dibs_lo_dev_name); + ret = dibs_dev_add(dibs); if (ret) goto err_reg; @@ -60,7 +59,7 @@ static int dibs_lo_dev_probe(void) err_reg: /* pairs with dibs_dev_alloc() */ - kfree(dibs); + put_device(&dibs->dev); kfree(ldev); return ret; @@ -71,9 +70,9 @@ static void dibs_lo_dev_remove(void) if (!lo_dev) return; - dibs_lo_dev_exit(lo_dev); + dibs_dev_del(lo_dev->dibs); /* pairs with dibs_dev_alloc() */ - kfree(lo_dev->dibs); + put_device(&lo_dev->dibs->dev); kfree(lo_dev); lo_dev = NULL; } diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index f1cfa5849277..610b6c452211 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -88,11 +88,24 @@ int dibs_unregister_client(struct dibs_client *client) } 
EXPORT_SYMBOL_GPL(dibs_unregister_client); +static void dibs_dev_release(struct device *dev) +{ + struct dibs_dev *dibs; + + dibs = container_of(dev, struct dibs_dev, dev); + + kfree(dibs); +} + struct dibs_dev *dibs_dev_alloc(void) { struct dibs_dev *dibs; dibs = kzalloc(sizeof(*dibs), GFP_KERNEL); + if (!dibs) + return dibs; + dibs->dev.release = dibs_dev_release; + device_initialize(&dibs->dev); return dibs; } @@ -100,7 +113,11 @@ EXPORT_SYMBOL_GPL(dibs_dev_alloc); int dibs_dev_add(struct dibs_dev *dibs) { - int i; + int i, ret; + + ret = device_add(&dibs->dev); + if (ret) + return ret; mutex_lock(&dibs_dev_list.mutex); mutex_lock(&clients_lock); @@ -129,6 +146,8 @@ void dibs_dev_del(struct dibs_dev *dibs) mutex_unlock(&clients_lock); list_del_init(&dibs->list); mutex_unlock(&dibs_dev_list.mutex); + + device_del(&dibs->dev); } EXPORT_SYMBOL_GPL(dibs_dev_del); diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 2bd8f64ebb56..4096ea9faa7e 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -602,15 +602,6 @@ out: return ret; } -static void ism_dev_release(struct device *dev) -{ - struct ism_dev *ism; - - ism = container_of(dev, struct ism_dev, dev); - - kfree(ism); -} - static void ism_dev_exit(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; @@ -649,17 +640,10 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) spin_lock_init(&ism->cmd_lock); dev_set_drvdata(&pdev->dev, ism); ism->pdev = pdev; - ism->dev.parent = &pdev->dev; - ism->dev.release = ism_dev_release; - device_initialize(&ism->dev); - dev_set_name(&ism->dev, "%s", dev_name(&pdev->dev)); - ret = device_add(&ism->dev); - if (ret) - goto err_dev; ret = pci_enable_device_mem(pdev); if (ret) - goto err; + goto err_dev; ret = pci_request_mem_regions(pdev, DRV_NAME); if (ret) @@ -687,6 +671,9 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (ret) goto err_dibs; + dibs->dev.parent = &pdev->dev; + dev_set_name(&dibs->dev, "%s", dev_name(&pdev->dev)); + ret = dibs_dev_add(dibs); if (ret) goto err_ism; @@ -697,16 +684,14 @@ err_ism: ism_dev_exit(ism); err_dibs: /* pairs with dibs_dev_alloc() */ - kfree(dibs); + put_device(&dibs->dev); err_resource: pci_release_mem_regions(pdev); err_disable: pci_disable_device(pdev); -err: - device_del(&ism->dev); err_dev: dev_set_drvdata(&pdev->dev, NULL); - put_device(&ism->dev); + kfree(ism); return ret; } @@ -719,13 +704,12 @@ static void ism_remove(struct pci_dev *pdev) dibs_dev_del(dibs); ism_dev_exit(ism); /* pairs with dibs_dev_alloc() */ - kfree(dibs); + put_device(&dibs->dev); pci_release_mem_regions(pdev); pci_disable_device(pdev); - device_del(&ism->dev); dev_set_drvdata(&pdev->dev, NULL); - put_device(&ism->dev); + kfree(ism); } static struct pci_driver ism_driver = { @@ -866,13 +850,6 @@ static void smcd_get_local_gid(struct smcd_dev *smcd, smcd_gid->gid_ext = 0; } -static inline struct device *smcd_get_dev(struct smcd_dev *dev) -{ - struct ism_dev *ism = dev->priv; - - return &ism->dev; -} - static const struct smcd_ops ism_smcd_ops = { .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, @@ -885,7 +862,6 @@ static const struct smcd_ops ism_smcd_ops = { .move_data = smcd_move, .supports_v2 = smcd_supports_v2, .get_local_gid = smcd_get_local_gid, - .get_dev = smcd_get_dev, }; const struct smcd_ops *ism_get_smcd_ops(void) diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 805ab33271b5..793c6e1ece0f 100644 --- a/include/linux/dibs.h +++ 
b/include/linux/dibs.h @@ -135,6 +135,7 @@ struct dibs_dev_ops { struct dibs_dev { struct list_head list; + struct device dev; /* To be filled by device driver, before calling dibs_dev_add(): */ const struct dibs_dev_ops *ops; /* priv pointer for device driver */ diff --git a/include/linux/ism.h b/include/linux/ism.h index c818a25996db..84f1afb3dded 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -42,7 +42,6 @@ struct ism_dev { struct ism_eq *ieq; dma_addr_t ieq_dma_addr; - struct device dev; u64 local_gid; int ieq_idx; diff --git a/include/net/smc.h b/include/net/smc.h index e271891b85e6..05faac83371e 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -63,7 +63,6 @@ struct smcd_ops { unsigned int size); int (*supports_v2)(void); void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); - struct device* (*get_dev)(struct smcd_dev *dev); /* optional operations */ int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index a9e80f44307d..42ab0795d563 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -924,7 +924,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) if (ini->is_smcd) { /* SMC-D specific settings */ smcd = ini->ism_dev[ini->ism_selected]; - get_device(smcd->ops->get_dev(smcd)); + get_device(&smcd->dibs->dev); lgr->peer_gid.gid = ini->ism_peer_gid[ini->ism_selected].gid; lgr->peer_gid.gid_ext = @@ -1474,7 +1474,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - put_device(lgr->smcd->ops->get_dev(lgr->smcd)); + put_device(&lgr->smcd->dibs->dev); } smc_lgr_put(lgr); /* theoretically last lgr_put */ } diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 415f03910c91..6a6e7c9641e8 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -303,12 +303,12 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, char smc_pnet[SMC_MAX_PNETID_LEN + 1]; struct smc_pci_dev smc_pci_dev; struct nlattr *port_attrs; + struct dibs_dev *dibs; struct nlattr *attrs; - struct ism_dev *ism; int use_cnt = 0; void *nlh; - ism = smcd->priv; + dibs = smcd->dibs; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_DEV_SMCD); @@ -323,7 +323,7 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); - smc_set_pci_values(ism->pdev, &smc_pci_dev); + smc_set_pci_values(to_pci_dev(dibs->dev.parent), &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) @@ -509,14 +509,14 @@ static void smcd_register_dev(struct dibs_dev *dibs) if (smc_ism_is_loopback(dibs)) { ops = smc_lo_get_smcd_ops(); - smcd = smcd_alloc_dev(dev_name(&smc_lo->dev), ops, + smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, SMC_LO_MAX_DMBS); } else { ism = dibs->drv_priv; #if IS_ENABLED(CONFIG_ISM) ops = ism_get_smcd_ops(); #endif - smcd = smcd_alloc_dev(dev_name(&ism->pdev->dev), ops, + smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, ISM_NR_DMBS); } if (!smcd) @@ -534,10 +534,11 @@ static void smcd_register_dev(struct dibs_dev *dibs) ism_set_priv(ism, &smc_ism_client, smcd); smcd->client = &smc_ism_client; #endif - if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) - smc_pnetid_by_table_smcd(smcd); } + if 
(smc_pnetid_by_dev_port(dibs->dev.parent, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); + if (smc_ism_is_loopback(dibs) || smcd->ops->supports_v2()) smc_ism_set_v2_capable(); mutex_lock(&smcd_dev_list.mutex); @@ -557,33 +558,24 @@ static void smcd_register_dev(struct dibs_dev *dibs) } mutex_unlock(&smcd_dev_list.mutex); - if (smc_ism_is_loopback(dibs)) { - pr_warn_ratelimited("smc: adding smcd loopback device\n"); - } else { - if (smc_pnet_is_pnetid_set(smcd->pnetid)) - pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", - dev_name(&ism->dev), smcd->pnetid, - smcd->pnetid_by_user ? - " (user defined)" : - ""); - else - pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", - dev_name(&ism->dev)); - } + if (smc_pnet_is_pnetid_set(smcd->pnetid)) + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&dibs->dev), smcd->pnetid, + smcd->pnetid_by_user ? + " (user defined)" : + ""); + else + pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", + dev_name(&dibs->dev)); return; } static void smcd_unregister_dev(struct dibs_dev *dibs) { struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); - struct ism_dev *ism = dibs->drv_priv; - if (smc_ism_is_loopback(dibs)) { - pr_warn_ratelimited("smc: removing smcd loopback device\n"); - } else { - pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&ism->dev)); - } + pr_warn_ratelimited("smc: removing smcd device %s\n", + dev_name(&dibs->dev)); smcd->going_away = 1; smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 37d8366419f7..262d0d0df4d0 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -23,7 +23,6 @@ #define SMC_LO_SUPPORT_NOCOPY 0x1 #define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0) -static const char smc_lo_dev_name[] = "loopback-ism"; static struct smc_lo_dev *lo_dev; static void smc_lo_generate_ids(struct smc_lo_dev *ldev) @@ -255,11 +254,6 @@ static void smc_lo_get_local_gid(struct smcd_dev *smcd, smcd_gid->gid_ext = ldev->local_gid.gid_ext; } -static struct device *smc_lo_get_dev(struct smcd_dev *smcd) -{ - return &((struct smc_lo_dev *)smcd->priv)->dev; -} - static const struct smcd_ops lo_ops = { .query_remote_gid = smc_lo_query_rgid, .register_dmb = smc_lo_register_dmb, @@ -274,7 +268,6 @@ static const struct smcd_ops lo_ops = { .signal_event = NULL, .move_data = smc_lo_move_data, .get_local_gid = smc_lo_get_local_gid, - .get_dev = smc_lo_get_dev, }; const struct smcd_ops *smc_lo_get_smcd_ops(void) @@ -299,14 +292,6 @@ static void smc_lo_dev_exit(struct smc_lo_dev *ldev) wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); } -static void smc_lo_dev_release(struct device *dev) -{ - struct smc_lo_dev *ldev = - container_of(dev, struct smc_lo_dev, dev); - - kfree(ldev); -} - static int smc_lo_dev_probe(void) { struct smc_lo_dev *ldev; @@ -315,10 +300,6 @@ static int smc_lo_dev_probe(void) if (!ldev) return -ENOMEM; - ldev->dev.parent = NULL; - ldev->dev.release = smc_lo_dev_release; - device_initialize(&ldev->dev); - dev_set_name(&ldev->dev, smc_lo_dev_name); smc_lo_dev_init(ldev); lo_dev = ldev; /* global loopback device */ @@ -332,7 +313,7 @@ static void smc_lo_dev_remove(void) return; smc_lo_dev_exit(lo_dev); - put_device(&lo_dev->dev); /* device_initialize in smc_lo_dev_probe */ + kfree(lo_dev); lo_dev = NULL; } diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h index 76c62526e2e5..a033bf10890a 100644 --- a/net/smc/smc_loopback.h 
+++ b/net/smc/smc_loopback.h @@ -32,7 +32,6 @@ struct smc_lo_dmb_node { struct smc_lo_dev { struct smcd_dev *smcd; - struct device dev; struct smcd_gid local_gid; atomic_t dmb_cnt; rwlock_t dmb_ht_lock; diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 7225b5fa17a6..d0df7f2b03aa 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -169,7 +169,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) pr_warn_ratelimited("smc: smcd device %s " "erased user defined pnetid " "%.16s\n", - dev_name(smcd->ops->get_dev(smcd)), + dev_name(&smcd->dibs->dev), smcd->pnetid); memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN); smcd->pnetid_by_user = false; @@ -332,7 +332,7 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (!strncmp(dev_name(smcd_dev->ops->get_dev(smcd_dev)), + if (!strncmp(dev_name(&smcd_dev->dibs->dev), smcd_name, IB_DEVICE_NAME_MAX - 1)) goto out; } @@ -413,7 +413,6 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, bool smcddev_applied = true; bool ibdev_applied = true; struct smcd_dev *smcd; - struct device *dev; bool new_ibdev; /* try to apply the pnetid to active devices */ @@ -431,10 +430,8 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, if (smcd) { smcddev_applied = smc_pnet_apply_smcd(smcd, pnet_name); if (smcddev_applied) { - dev = smcd->ops->get_dev(smcd); - pr_warn_ratelimited("smc: smcd device %s " - "applied user defined pnetid " - "%.16s\n", dev_name(dev), + pr_warn_ratelimited("smc: smcd device %s applied user defined pnetid %.16s\n", + dev_name(&smcd->dibs->dev), smcd->pnetid); } } @@ -1193,7 +1190,7 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) */ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) { - const char *ib_name = dev_name(smcddev->ops->get_dev(smcddev)); + const char *ib_name = dev_name(&smcddev->dibs->dev); struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; -- cgit v1.2.3 From 05e68d8dedf34f270cc3769ffe7f0ed413f23add Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:56 +0200 Subject: dibs: Local gid for dibs devices Define a uuid_t GID attribute to identify a dibs device. SMC uses 64 Bit and 128 Bit Global Identifiers (GIDs) per device that need to be sent via the SMC protocol. Because the smc code uses integers, network endianness and host endianness need to be considered. Avoid this in the dibs layer by using uuid_t byte arrays. Future patches could change SMC to use uuid_t. For now, conversion helper functions are introduced. ISM devices provide 64 Bit GIDs. Map them to dibs uuid_t GIDs like this: _________________________________________ | 64 Bit ISM-vPCI GID | 00000000_00000000 | ----------------------------------------- If interpreted as a UUID [1], this would be read as the UUID variant that is reserved for NCS backward compatibility. So it will not collide with UUIDs that were generated according to the standard. smc_loopback already uses version 4 UUIDs as 128 Bit GIDs; move that to dibs loopback. A temporary change to smc_lo_query_rgid() is required; it will be moved to dibs_loopback with a follow-on patch. Provide the gid of a dibs device as a read-only sysfs attribute.
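To illustrate the new representation, here is a minimal round-trip sketch through the conversion helpers this patch adds to net/smc/smc_ism.h (the wrapper function itself is hypothetical):

	/* Hypothetical check: converting a dibs uuid_t GID to the smc
	 * host-order u64 pair and back must be lossless.
	 */
	static bool smcd_gid_round_trip_ok(uuid_t *gid)
	{
		struct smcd_gid sgid;
		uuid_t tmp;

		copy_to_smcdgid(&sgid, gid);	/* byte array -> u64 pair */
		copy_to_dibsgid(&tmp, &sgid);	/* u64 pair -> byte array */
		return uuid_equal(&tmp, gid);
	}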
Link: https://datatracker.ietf.org/doc/html/rfc4122 [1] Signed-off-by: Alexandra Winter Reviewed-by: Julian Ruess Reviewed-by: Mahanta Jambigi Link: https://patch.msgid.link/20250918110500.1731261-11-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_loopback.c | 1 + drivers/dibs/dibs_main.c | 12 ++++++++++++ drivers/s390/net/ism.h | 9 +++++++++ drivers/s390/net/ism_drv.c | 30 +++++++++--------------------- include/linux/dibs.h | 3 +++ include/linux/ism.h | 1 - include/net/smc.h | 1 - net/smc/smc_clc.c | 6 +++--- net/smc/smc_core.c | 2 +- net/smc/smc_diag.c | 2 +- net/smc/smc_ism.h | 22 ++++++++++++++++++++++ net/smc/smc_loopback.c | 29 ++++------------------------- net/smc/smc_loopback.h | 1 - 13 files changed, 65 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index 76e479d5724b..d7e6fa5e90f3 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -46,6 +46,7 @@ static int dibs_lo_dev_probe(void) ldev->dibs = dibs; dibs->drv_priv = ldev; + uuid_gen(&dibs->gid); dibs->ops = &dibs_lo_ops; dibs->dev.parent = NULL; diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index b3f21805aa59..f20ed0594a51 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -114,6 +114,17 @@ struct dibs_dev *dibs_dev_alloc(void) } EXPORT_SYMBOL_GPL(dibs_dev_alloc); +static ssize_t gid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct dibs_dev *dibs; + + dibs = container_of(dev, struct dibs_dev, dev); + + return sysfs_emit(buf, "%pUb\n", &dibs->gid); +} +static DEVICE_ATTR_RO(gid); + static ssize_t fabric_id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -128,6 +139,7 @@ static ssize_t fabric_id_show(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR_RO(fabric_id); static struct attribute *dibs_dev_attrs[] = { + &dev_attr_gid.attr, &dev_attr_fabric_id.attr, NULL, }; diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h index 3078779fa71e..1b9fa14da20c 100644 --- a/drivers/s390/net/ism.h +++ b/drivers/s390/net/ism.h @@ -67,6 +67,15 @@ union ism_reg_ieq { } response; } __aligned(16); +/* ISM-vPCI devices provide 64 Bit GIDs + * Map them to ISM UUID GIDs like this: + * _________________________________________ + * | 64 Bit ISM-vPCI GID | 00000000_00000000 | + * ----------------------------------------- + * This will be interpreted as a UUID variant that is reserved + * for NCS backward compatibility. So it will not collide with + * proper UUIDs.
+ */ union ism_read_gid { struct { struct ism_req_hdr hdr; diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index ab1d61eb3e3b..e58c55fb03c2 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -272,8 +272,9 @@ static int unregister_ieq(struct ism_dev *ism) return 0; } -static int ism_read_local_gid(struct ism_dev *ism) +static int ism_read_local_gid(struct dibs_dev *dibs) { + struct ism_dev *ism = dibs->drv_priv; union ism_read_gid cmd; int ret; @@ -285,7 +286,8 @@ static int ism_read_local_gid(struct ism_dev *ism) if (ret) goto out; - ism->local_gid = cmd.response.gid; + memset(&dibs->gid, 0, sizeof(dibs->gid)); + memcpy(&dibs->gid, &cmd.response.gid, sizeof(cmd.response.gid)); out: return ret; } @@ -563,10 +565,6 @@ static int ism_dev_init(struct ism_dev *ism) if (ret) goto unreg_sba; - ret = ism_read_local_gid(ism); - if (ret) - goto unreg_ieq; - if (!ism_add_vlan_id(ism, ISM_RESERVED_VLANID)) /* hardware is V2 capable */ ism_v2_capable = true; @@ -588,8 +586,6 @@ static int ism_dev_init(struct ism_dev *ism) query_info(ism); return 0; -unreg_ieq: - unregister_ieq(ism); unreg_sba: unregister_sba(ism); free_irq: @@ -672,6 +668,11 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (ret) goto err_dibs; + /* after ism_dev_init() we can call ism function to set gid */ + ret = ism_read_local_gid(dibs); + if (ret) + goto err_ism; + dibs->dev.parent = &pdev->dev; zdev = to_zpci(pdev); @@ -841,18 +842,6 @@ static int smcd_supports_v2(void) return ism_v2_capable; } -static u64 ism_get_local_gid(struct ism_dev *ism) -{ - return ism->local_gid; -} - -static void smcd_get_local_gid(struct smcd_dev *smcd, - struct smcd_gid *smcd_gid) -{ - smcd_gid->gid = ism_get_local_gid(smcd->priv); - smcd_gid->gid_ext = 0; -} - static const struct smcd_ops ism_smcd_ops = { .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, @@ -864,7 +853,6 @@ static const struct smcd_ops ism_smcd_ops = { .signal_event = smcd_signal_ieq, .move_data = smcd_move, .supports_v2 = smcd_supports_v2, - .get_local_gid = smcd_get_local_gid, }; const struct smcd_ops *ism_get_smcd_ops(void) diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 793c6e1ece0f..904f37505c27 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -10,6 +10,8 @@ #define _DIBS_H #include +#include + /* DIBS - Direct Internal Buffer Sharing - concept * ----------------------------------------------- * In the case of multiple system sharing the same hardware, dibs fabrics can @@ -138,6 +140,7 @@ struct dibs_dev { struct device dev; /* To be filled by device driver, before calling dibs_dev_add(): */ const struct dibs_dev_ops *ops; + uuid_t gid; /* priv pointer for device driver */ void *drv_priv; diff --git a/include/linux/ism.h b/include/linux/ism.h index 84f1afb3dded..a926dd61b5a1 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -42,7 +42,6 @@ struct ism_dev { struct ism_eq *ieq; dma_addr_t ieq_dma_addr; - u64 local_gid; int ieq_idx; struct ism_client *subs[MAX_CLIENTS]; diff --git a/include/net/smc.h b/include/net/smc.h index 05faac83371e..9cb8385bbc6e 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -62,7 +62,6 @@ struct smcd_ops { bool sf, unsigned int offset, void *data, unsigned int size); int (*supports_v2)(void); - void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); /* optional operations */ int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 
09745baa1017..157aace169d4 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -916,7 +916,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) /* add SMC-D specifics */ if (ini->ism_dev[0]) { smcd = ini->ism_dev[0]; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); pclc_smcd->ism.gid = htonll(smcd_gid.gid); pclc_smcd->ism.chid = htons(smc_ism_get_chid(ini->ism_dev[0])); @@ -966,7 +966,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) if (ini->ism_offered_cnt) { for (i = 1; i <= ini->ism_offered_cnt; i++) { smcd = ini->ism_dev[i]; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); gidchids[entry].chid = htons(smc_ism_get_chid(ini->ism_dev[i])); gidchids[entry].gid = htonll(smcd_gid.gid); @@ -1059,7 +1059,7 @@ smcd_clc_prep_confirm_accept(struct smc_connection *conn, /* SMC-D specific settings */ memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)); - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); clc->hdr.typev1 = SMC_TYPE_D; clc->d0.gid = htonll(smcd_gid.gid); clc->d0.token = htonll(conn->rmb_desc->token); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 42ab0795d563..be0c2da83d2b 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -555,7 +555,7 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) goto errattr; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, smcd_gid.gid, SMC_NLA_LGR_D_PAD)) goto errattr; diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 8ed2f6689b01..bf0beaa23bdb 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -175,7 +175,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, dinfo.linkid = *((u32 *)conn->lgr->id); dinfo.peer_gid = conn->lgr->peer_gid.gid; dinfo.peer_gid_ext = conn->lgr->peer_gid.gid_ext; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); dinfo.my_gid = smcd_gid.gid; dinfo.my_gid_ext = smcd_gid.gid_ext; dinfo.token = conn->rmb_desc->token; diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 04699951d03f..139e99da2c9f 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -96,4 +96,26 @@ static inline bool smc_ism_is_loopback(struct dibs_dev *dibs) return (dibs->ops->get_fabric_id(dibs) == DIBS_LOOPBACK_FABRIC); } +static inline void copy_to_smcdgid(struct smcd_gid *sgid, uuid_t *dibs_gid) +{ + __be64 temp; + + memcpy(&temp, dibs_gid, sizeof(sgid->gid)); + sgid->gid = ntohll(temp); + memcpy(&temp, (uint8_t *)dibs_gid + sizeof(sgid->gid), + sizeof(sgid->gid_ext)); + sgid->gid_ext = ntohll(temp); +} + +static inline void copy_to_dibsgid(uuid_t *dibs_gid, struct smcd_gid *sgid) +{ + __be64 temp; + + temp = htonll(sgid->gid); + memcpy(dibs_gid, &temp, sizeof(sgid->gid)); + temp = htonll(sgid->gid_ext); + memcpy((uint8_t *)dibs_gid + sizeof(sgid->gid), &temp, + sizeof(sgid->gid_ext)); +} + #endif diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 262d0d0df4d0..454d9d6a6e8f 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -13,6 +13,7 @@ #include #include +#include #include #include "smc_cdc.h" @@ -25,25 +26,14 @@ static struct smc_lo_dev *lo_dev; -static void smc_lo_generate_ids(struct smc_lo_dev *ldev) -{ - struct smcd_gid *lgid = &ldev->local_gid; - uuid_t 
uuid; - - uuid_gen(&uuid); - memcpy(&lgid->gid, &uuid, sizeof(lgid->gid)); - memcpy(&lgid->gid_ext, (u8 *)&uuid + sizeof(lgid->gid), - sizeof(lgid->gid_ext)); -} - static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, u32 vid_valid, u32 vid) { - struct smc_lo_dev *ldev = smcd->priv; + uuid_t temp; + copy_to_dibsgid(&temp, rgid); /* rgid should be the same as lgid */ - if (!ldev || rgid->gid != ldev->local_gid.gid || - rgid->gid_ext != ldev->local_gid.gid_ext) + if (!uuid_equal(&temp, &smcd->dibs->gid)) return -ENETUNREACH; return 0; } @@ -245,15 +235,6 @@ static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, return 0; } -static void smc_lo_get_local_gid(struct smcd_dev *smcd, - struct smcd_gid *smcd_gid) -{ - struct smc_lo_dev *ldev = smcd->priv; - - smcd_gid->gid = ldev->local_gid.gid; - smcd_gid->gid_ext = ldev->local_gid.gid_ext; -} - static const struct smcd_ops lo_ops = { .query_remote_gid = smc_lo_query_rgid, .register_dmb = smc_lo_register_dmb, @@ -267,7 +248,6 @@ static const struct smcd_ops lo_ops = { .reset_vlan_required = NULL, .signal_event = NULL, .move_data = smc_lo_move_data, - .get_local_gid = smc_lo_get_local_gid, }; const struct smcd_ops *smc_lo_get_smcd_ops(void) @@ -277,7 +257,6 @@ const struct smcd_ops *smc_lo_get_smcd_ops(void) static void smc_lo_dev_init(struct smc_lo_dev *ldev) { - smc_lo_generate_ids(ldev); rwlock_init(&ldev->dmb_ht_lock); hash_init(ldev->dmb_ht); atomic_set(&ldev->dmb_cnt, 0); diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h index a033bf10890a..33bb96ec8b77 100644 --- a/net/smc/smc_loopback.h +++ b/net/smc/smc_loopback.h @@ -32,7 +32,6 @@ struct smc_lo_dmb_node { struct smc_lo_dev { struct smcd_dev *smcd; - struct smcd_gid local_gid; atomic_t dmb_cnt; rwlock_t dmb_ht_lock; DECLARE_BITMAP(sba_idx_mask, SMC_LO_MAX_DMBS); -- cgit v1.2.3 From 92a0f7bb081dde6e88368816b8ba51352ddabb1d Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:57 +0200 Subject: dibs: Move vlan support to dibs_dev_ops It can be debated how much benefit the definition of vlan ids for dibs devices brings, as the dmbs are accessible only by a single peer anyhow. But ism provides vlan support and smcd exploits it, so move it to the dibs layer as an optional feature. smc_loopback simply ignores all vlan settings; do the same in dibs_loopback. SMC-D and ISM have a method to use the invalid VLAN ID 1FFF (ISM_RESERVED_VLANID) to indicate that both communication peers support routable SMC-Dv2. Tolerate it in dibs, but move it to SMC only.
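As an example, a client that wants to know whether a device supports routable SMC-Dv2 now has to check for the optional op before calling it (sketch of the pattern used in smcd_register_dev() in net/smc/smc_ism.c):

	/* add_vlan_id() is optional; a device that accepts the reserved
	 * VLAN ID 0x1FFF signals routable SMC-Dv2 support.
	 */
	if (smc_ism_is_loopback(dibs) ||
	    (dibs->ops->add_vlan_id &&
	     !dibs->ops->add_vlan_id(dibs, ISM_RESERVED_VLANID)))
		smc_ism_set_v2_capable();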
Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-12-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/s390/net/ism_drv.c | 47 ++++++---------------------------------------- include/linux/dibs.h | 19 +++++++++++++++++++ include/net/smc.h | 5 ----- net/smc/smc_ism.c | 14 +++++++++----- net/smc/smc_loopback.c | 5 ----- 5 files changed, 34 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index e58c55fb03c2..ed4c28ca355b 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -36,7 +36,6 @@ static struct ism_client *clients[MAX_CLIENTS]; /* use an array rather than */ /* a list for fast mapping */ static u8 max_client; static DEFINE_MUTEX(clients_lock); -static bool ism_v2_capable; struct ism_dev_list { struct list_head list; struct mutex mutex; /* protects ism device list */ @@ -409,8 +408,9 @@ out: } EXPORT_SYMBOL_GPL(ism_unregister_dmb); -static int ism_add_vlan_id(struct ism_dev *ism, u64 vlan_id) +static int ism_add_vlan_id(struct dibs_dev *dibs, u64 vlan_id) { + struct ism_dev *ism = dibs->drv_priv; union ism_set_vlan_id cmd; memset(&cmd, 0, sizeof(cmd)); @@ -422,8 +422,9 @@ static int ism_add_vlan_id(struct ism_dev *ism, u64 vlan_id) return ism_cmd(ism, &cmd); } -static int ism_del_vlan_id(struct ism_dev *ism, u64 vlan_id) +static int ism_del_vlan_id(struct dibs_dev *dibs, u64 vlan_id) { + struct ism_dev *ism = dibs->drv_priv; union ism_set_vlan_id cmd; memset(&cmd, 0, sizeof(cmd)); @@ -536,6 +537,8 @@ static irqreturn_t ism_handle_irq(int irq, void *data) static const struct dibs_dev_ops ism_ops = { .get_fabric_id = ism_get_chid, + .add_vlan_id = ism_add_vlan_id, + .del_vlan_id = ism_del_vlan_id, }; static int ism_dev_init(struct ism_dev *ism) @@ -565,12 +568,6 @@ static int ism_dev_init(struct ism_dev *ism) if (ret) goto unreg_sba; - if (!ism_add_vlan_id(ism, ISM_RESERVED_VLANID)) - /* hardware is V2 capable */ - ism_v2_capable = true; - else - ism_v2_capable = false; - mutex_lock(&ism_dev_list.mutex); mutex_lock(&clients_lock); for (i = 0; i < max_client; ++i) { @@ -611,8 +608,6 @@ static void ism_dev_exit(struct ism_dev *ism) mutex_lock(&ism_dev_list.mutex); - if (ism_v2_capable) - ism_del_vlan_id(ism, ISM_RESERVED_VLANID); unregister_ieq(ism); unregister_sba(ism); free_irq(pci_irq_vector(pdev, 0), ism); @@ -786,26 +781,6 @@ static int smcd_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) return ism_unregister_dmb(smcd->priv, (struct ism_dmb *)dmb); } -static int smcd_add_vlan_id(struct smcd_dev *smcd, u64 vlan_id) -{ - return ism_add_vlan_id(smcd->priv, vlan_id); -} - -static int smcd_del_vlan_id(struct smcd_dev *smcd, u64 vlan_id) -{ - return ism_del_vlan_id(smcd->priv, vlan_id); -} - -static int smcd_set_vlan_required(struct smcd_dev *smcd) -{ - return ism_cmd_simple(smcd->priv, ISM_SET_VLAN); -} - -static int smcd_reset_vlan_required(struct smcd_dev *smcd) -{ - return ism_cmd_simple(smcd->priv, ISM_RESET_VLAN); -} - static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, u32 event_code, u64 info) { @@ -837,22 +812,12 @@ static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, return ism_move(smcd->priv, dmb_tok, idx, sf, offset, data, size); } -static int smcd_supports_v2(void) -{ - return ism_v2_capable; -} - static const struct smcd_ops ism_smcd_ops = { .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, .unregister_dmb = smcd_unregister_dmb, - .add_vlan_id = 
smcd_add_vlan_id, - .del_vlan_id = smcd_del_vlan_id, - .set_vlan_required = smcd_set_vlan_required, - .reset_vlan_required = smcd_reset_vlan_required, .signal_event = smcd_signal_ieq, .move_data = smcd_move, - .supports_v2 = smcd_supports_v2, }; const struct smcd_ops *ism_get_smcd_ops(void) diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 904f37505c27..166148fb8d76 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -133,6 +133,25 @@ struct dibs_dev_ops { * Return: 2 byte dibs fabric id */ u16 (*get_fabric_id)(struct dibs_dev *dev); + /** + * add_vlan_id() - add dibs device to vlan (optional, deprecated) + * @dev: dibs device + * @vlan_id: vlan id + * + * In order to write into a vlan-tagged dmb, the remote device needs + * to belong to this vlan. A device can belong to more than 1 vlan. + * Any device can access an untagged dmb. + * Deprecated, only supported for backwards compatibility. + * Return: zero on success + */ + int (*add_vlan_id)(struct dibs_dev *dev, u64 vlan_id); + /** + * del_vlan_id() - remove dibs device from vlan (optional, deprecated) + * @dev: dibs device + * @vlan_id: vlan id + * Return: zero on success + */ + int (*del_vlan_id)(struct dibs_dev *dev, u64 vlan_id); }; struct dibs_dev { diff --git a/include/net/smc.h b/include/net/smc.h index 9cb8385bbc6e..51b4aefc106a 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -61,13 +61,8 @@ struct smcd_ops { int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, unsigned int offset, void *data, unsigned int size); - int (*supports_v2)(void); /* optional operations */ - int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); - int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id); - int (*set_vlan_required)(struct smcd_dev *dev); - int (*reset_vlan_required)(struct smcd_dev *dev); int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, u32 trigger_irq, u32 event_code, u64 info); int (*support_dmb_nocopy)(struct smcd_dev *dev); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 6a6e7c9641e8..5118441bed18 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -140,7 +140,7 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; - if (!smcd->ops->add_vlan_id) + if (!smcd->dibs->ops->add_vlan_id) return -EOPNOTSUPP; /* create new vlan entry, in case we need it */ @@ -163,7 +163,7 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) /* no existing entry found.
* add new entry to device; might fail, e.g., if HW limit reached */ - if (smcd->ops->add_vlan_id(smcd, vlanid)) { + if (smcd->dibs->ops->add_vlan_id(smcd->dibs, vlanid)) { kfree(new_vlan); rc = -EIO; goto out; @@ -187,7 +187,7 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; - if (!smcd->ops->del_vlan_id) + if (!smcd->dibs->ops->del_vlan_id) return -EOPNOTSUPP; spin_lock_irqsave(&smcd->lock, flags); @@ -205,7 +205,7 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) } /* Found and the last reference just gone */ - if (smcd->ops->del_vlan_id(smcd, vlanid)) + if (smcd->dibs->ops->del_vlan_id(smcd->dibs, vlanid)) rc = -EIO; list_del(&vlan->list); kfree(vlan); @@ -539,8 +539,12 @@ static void smcd_register_dev(struct dibs_dev *dibs) if (smc_pnetid_by_dev_port(dibs->dev.parent, 0, smcd->pnetid)) smc_pnetid_by_table_smcd(smcd); - if (smc_ism_is_loopback(dibs) || smcd->ops->supports_v2()) + if (smc_ism_is_loopback(dibs) || + (dibs->ops->add_vlan_id && + !dibs->ops->add_vlan_id(dibs, ISM_RESERVED_VLANID))) { smc_ism_set_v2_capable(); + } + mutex_lock(&smcd_dev_list.mutex); /* sort list: * - devices without pnetid before devices with pnetid; diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 454d9d6a6e8f..982a19430313 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -20,7 +20,6 @@ #include "smc_ism.h" #include "smc_loopback.h" -#define SMC_LO_V2_CAPABLE 0x1 /* loopback-ism acts as ISMv2 */ #define SMC_LO_SUPPORT_NOCOPY 0x1 #define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0) @@ -242,10 +241,6 @@ static const struct smcd_ops lo_ops = { .support_dmb_nocopy = smc_lo_support_dmb_nocopy, .attach_dmb = smc_lo_attach_dmb, .detach_dmb = smc_lo_detach_dmb, - .add_vlan_id = NULL, - .del_vlan_id = NULL, - .set_vlan_required = NULL, - .reset_vlan_required = NULL, .signal_event = NULL, .move_data = smc_lo_move_data, }; -- cgit v1.2.3 From 719c3b67bb7ea95bb8158b03c75641c8fc8f94a0 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:58 +0200 Subject: dibs: Move query_remote_gid() to dibs_dev_ops Provide dibs_dev_ops->query_remote_gid() in the ism and dibs_loopback dibs devices, and call it in the smc dibs client.
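For example, the reachability check in smc_ism_cantalk() now goes through the device's op table (sketch; names as in net/smc/smc_ism.c, conversion helper from net/smc/smc_ism.h):

	struct dibs_dev *dibs = smcd->dibs;
	uuid_t ism_rgid;

	copy_to_dibsgid(&ism_rgid, peer_gid);	/* smcd_gid -> uuid_t */
	return dibs->ops->query_remote_gid(dibs, &ism_rgid,
					   vlan_id ? 1 : 0, vlan_id);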
Reviewed-by: Julian Ruess Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-13-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_loopback.c | 10 ++++++++++ drivers/s390/net/ism_drv.c | 41 ++++++++++++++++++----------------------- include/linux/dibs.h | 14 ++++++++++++++ include/net/smc.h | 2 -- net/smc/smc_ism.c | 8 ++++++-- net/smc/smc_loopback.c | 13 ------------- 6 files changed, 48 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index d7e6fa5e90f3..6b53e626a6d1 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -24,8 +24,18 @@ static u16 dibs_lo_get_fabric_id(struct dibs_dev *dibs) return DIBS_LOOPBACK_FABRIC; } +static int dibs_lo_query_rgid(struct dibs_dev *dibs, const uuid_t *rgid, + u32 vid_valid, u32 vid) +{ + /* rgid should be the same as lgid */ + if (!uuid_equal(rgid, &dibs->gid)) + return -ENETUNREACH; + return 0; +} + static const struct dibs_dev_ops dibs_lo_ops = { .get_fabric_id = dibs_lo_get_fabric_id, + .query_remote_gid = dibs_lo_query_rgid, }; static int dibs_lo_dev_probe(void) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index ed4c28ca355b..121b3a2be760 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -291,6 +291,23 @@ out: return ret; } +static int ism_query_rgid(struct dibs_dev *dibs, const uuid_t *rgid, + u32 vid_valid, u32 vid) +{ + struct ism_dev *ism = dibs->drv_priv; + union ism_query_rgid cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.request.hdr.cmd = ISM_QUERY_RGID; + cmd.request.hdr.len = sizeof(cmd.request); + + memcpy(&cmd.request.rgid, rgid, sizeof(cmd.request.rgid)); + cmd.request.vlan_valid = vid_valid; + cmd.request.vlan_id = vid; + + return ism_cmd(ism, &cmd); +} + static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) { clear_bit(dmb->sba_idx, ism->sba_bitmap); @@ -537,6 +554,7 @@ static irqreturn_t ism_handle_irq(int irq, void *data) static const struct dibs_dev_ops ism_ops = { .get_fabric_id = ism_get_chid, + .query_remote_gid = ism_query_rgid, .add_vlan_id = ism_add_vlan_id, .del_vlan_id = ism_del_vlan_id, }; @@ -748,28 +766,6 @@ module_exit(ism_exit); /*************************** SMC-D Implementation *****************************/ #if IS_ENABLED(CONFIG_SMC) -static int ism_query_rgid(struct ism_dev *ism, u64 rgid, u32 vid_valid, - u32 vid) -{ - union ism_query_rgid cmd; - - memset(&cmd, 0, sizeof(cmd)); - cmd.request.hdr.cmd = ISM_QUERY_RGID; - cmd.request.hdr.len = sizeof(cmd.request); - - cmd.request.rgid = rgid; - cmd.request.vlan_valid = vid_valid; - cmd.request.vlan_id = vid; - - return ism_cmd(ism, &cmd); -} - -static int smcd_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, - u32 vid_valid, u32 vid) -{ - return ism_query_rgid(smcd->priv, rgid->gid, vid_valid, vid); -} - static int smcd_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, void *client) { @@ -813,7 +809,6 @@ static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, } static const struct smcd_ops ism_smcd_ops = { - .query_remote_gid = smcd_query_rgid, .register_dmb = smcd_register_dmb, .unregister_dmb = smcd_unregister_dmb, .signal_event = smcd_signal_ieq, diff --git a/include/linux/dibs.h b/include/linux/dibs.h index 166148fb8d76..c75a40fe3039 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -133,6 +133,20 @@ struct dibs_dev_ops { * Return: 2 byte dibs fabric id */ u16 (*get_fabric_id)(struct dibs_dev 
*dev); + /** + * query_remote_gid() + * @dev: local dibs device + * @rgid: gid of remote dibs device + * @vid_valid: if zero, vid will be ignored; + * deprecated, ignored if device does not support vlan + * @vid: VLAN id; deprecated, ignored if device does not support vlan + * + * Query whether a remote dibs device is reachable via this local device + * and this vlan id. + * Return: 0 if remote gid is reachable. + */ + int (*query_remote_gid)(struct dibs_dev *dev, const uuid_t *rgid, + u32 vid_valid, u32 vid); /** * add_vlan_id() - add dibs device to vlan (optional, deprecated) * @dev: dibs device diff --git a/include/net/smc.h b/include/net/smc.h index 51b4aefc106a..5bd135fb4d49 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -53,8 +53,6 @@ struct smcd_gid { }; struct smcd_ops { - int (*query_remote_gid)(struct smcd_dev *dev, struct smcd_gid *rgid, - u32 vid_valid, u32 vid); int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb, void *client); int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 5118441bed18..d20d00b46825 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -77,8 +77,12 @@ static void smc_ism_create_system_eid(void) int smc_ism_cantalk(struct smcd_gid *peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) { - return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0, - vlan_id); + struct dibs_dev *dibs = smcd->dibs; + uuid_t ism_rgid; + + copy_to_dibsgid(&ism_rgid, peer_gid); + return dibs->ops->query_remote_gid(dibs, &ism_rgid, vlan_id ? 1 : 0, + vlan_id); } void smc_ism_get_system_eid(u8 **eid) diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c index 982a19430313..52cba01cb209 100644 --- a/net/smc/smc_loopback.c +++ b/net/smc/smc_loopback.c @@ -25,18 +25,6 @@ static struct smc_lo_dev *lo_dev; -static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, - u32 vid_valid, u32 vid) -{ - uuid_t temp; - - copy_to_dibsgid(&temp, rgid); - /* rgid should be the same as lgid */ - if (!uuid_equal(&temp, &smcd->dibs->gid)) - return -ENETUNREACH; - return 0; -} - static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, void *client_priv) { @@ -235,7 +223,6 @@ static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, } static const struct smcd_ops lo_ops = { - .query_remote_gid = smc_lo_query_rgid, .register_dmb = smc_lo_register_dmb, .unregister_dmb = smc_lo_unregister_dmb, .support_dmb_nocopy = smc_lo_support_dmb_nocopy, -- cgit v1.2.3 From cc21191b584c6f7836b0f10774f8278b7cbfba10 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 18 Sep 2025 13:04:59 +0200 Subject: dibs: Move data path to dibs layer Use struct dibs_dmb instead of struct smcd_dmb and move the corresponding client tables to dibs_dev. Leave driver-specific implementation details like sba in the device drivers. Register and unregister dmbs via dibs_dev_ops. A dmb is dedicated to a single client, but a dibs device can have dmbs for more than one client. Trigger dibs clients via dibs_client_ops->handle_irq() when data is received into a dmb. For dibs_loopback, replace scheduling an smcd receive tasklet with calling dibs_client_ops->handle_irq(). For loopback devices, attach_dmb(), detach_dmb() and move_data() need to access the dmb tables, so move those to dibs_dev_ops in this patch as well. Remove remaining definitions of smc_loopback as they are no longer required, now that everything is in dibs_loopback.
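A minimal sketch of the resulting client-side pattern (the setup function and buffer size are illustrative; handle_irq() signature as used by dibs_loopback; error handling omitted):

	/* Called back by the device driver when data has arrived in the
	 * dmb with index idx.
	 */
	static void my_handle_irq(struct dibs_dev *dibs, unsigned int idx,
				  u16 dmbemask)
	{
	}

	static int my_setup_dmb(struct dibs_dev *dibs, struct dibs_client *me,
				struct dibs_dmb *dmb)
	{
		dmb->dmb_len = 16384;	/* illustrative size */
		/* on success the driver fills idx, dmb_tok and cpu_addr;
		 * the dmb is dedicated to this single client
		 */
		return dibs->ops->register_dmb(dibs, dmb, me);
	}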
Note that struct ism_client and struct ism_dev are still required in smc until a follow-on patch moves event handling to dibs. (Loopback does not use events). Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-14-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- drivers/dibs/dibs_loopback.c | 257 +++++++++++++++++++++++++++++++++++++ drivers/dibs/dibs_loopback.h | 19 +++ drivers/dibs/dibs_main.c | 56 ++++++++- drivers/s390/net/ism_drv.c | 121 ++++++++---------- include/linux/dibs.h | 177 ++++++++++++++++++++++++++ include/linux/ism.h | 23 ---- include/net/smc.h | 22 ---- net/smc/Makefile | 1 - net/smc/smc_ism.c | 70 +++++------ net/smc/smc_ism.h | 4 +- net/smc/smc_loopback.c | 294 ------------------------------------------- net/smc/smc_loopback.h | 47 ------- 12 files changed, 591 insertions(+), 500 deletions(-) delete mode 100644 net/smc/smc_loopback.c delete mode 100644 net/smc/smc_loopback.h (limited to 'include/linux') diff --git a/drivers/dibs/dibs_loopback.c b/drivers/dibs/dibs_loopback.c index 6b53e626a6d1..b3fd0f8100d4 100644 --- a/drivers/dibs/dibs_loopback.c +++ b/drivers/dibs/dibs_loopback.c @@ -9,12 +9,18 @@ * */ +#include +#include #include #include +#include #include #include "dibs_loopback.h" +#define DIBS_LO_SUPPORT_NOCOPY 0x1 +#define DIBS_DMA_ADDR_INVALID (~(dma_addr_t)0) + static const char dibs_lo_dev_name[] = "lo"; /* global loopback device */ static struct dibs_lo_dev *lo_dev; @@ -33,11 +39,259 @@ static int dibs_lo_query_rgid(struct dibs_dev *dibs, const uuid_t *rgid, return 0; } +static int dibs_lo_max_dmbs(void) +{ + return DIBS_LO_MAX_DMBS; +} + +static int dibs_lo_register_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb, + struct dibs_client *client) +{ + struct dibs_lo_dmb_node *dmb_node, *tmp_node; + struct dibs_lo_dev *ldev; + unsigned long flags; + int sba_idx, rc; + + ldev = dibs->drv_priv; + sba_idx = dmb->idx; + /* check space for new dmb */ + for_each_clear_bit(sba_idx, ldev->sba_idx_mask, DIBS_LO_MAX_DMBS) { + if (!test_and_set_bit(sba_idx, ldev->sba_idx_mask)) + break; + } + if (sba_idx == DIBS_LO_MAX_DMBS) + return -ENOSPC; + + dmb_node = kzalloc(sizeof(*dmb_node), GFP_KERNEL); + if (!dmb_node) { + rc = -ENOMEM; + goto err_bit; + } + + dmb_node->sba_idx = sba_idx; + dmb_node->len = dmb->dmb_len; + dmb_node->cpu_addr = kzalloc(dmb_node->len, GFP_KERNEL | + __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC); + if (!dmb_node->cpu_addr) { + rc = -ENOMEM; + goto err_node; + } + dmb_node->dma_addr = DIBS_DMA_ADDR_INVALID; + refcount_set(&dmb_node->refcnt, 1); + +again: + /* add new dmb into hash table */ + get_random_bytes(&dmb_node->token, sizeof(dmb_node->token)); + write_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_node->token) { + if (tmp_node->token == dmb_node->token) { + write_unlock_bh(&ldev->dmb_ht_lock); + goto again; + } + } + hash_add(ldev->dmb_ht, &dmb_node->list, dmb_node->token); + write_unlock_bh(&ldev->dmb_ht_lock); + atomic_inc(&ldev->dmb_cnt); + + dmb->idx = dmb_node->sba_idx; + dmb->dmb_tok = dmb_node->token; + dmb->cpu_addr = dmb_node->cpu_addr; + dmb->dma_addr = dmb_node->dma_addr; + dmb->dmb_len = dmb_node->len; + + spin_lock_irqsave(&dibs->lock, flags); + dibs->dmb_clientid_arr[sba_idx] = client->id; + spin_unlock_irqrestore(&dibs->lock, flags); + + return 0; + +err_node: + kfree(dmb_node); +err_bit: + clear_bit(sba_idx, ldev->sba_idx_mask); + return rc; +} + +static void __dibs_lo_unregister_dmb(struct dibs_lo_dev *ldev, + struct dibs_lo_dmb_node 
*dmb_node) +{ + /* remove dmb from hash table */ + write_lock_bh(&ldev->dmb_ht_lock); + hash_del(&dmb_node->list); + write_unlock_bh(&ldev->dmb_ht_lock); + + clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask); + kfree(dmb_node->cpu_addr); + kfree(dmb_node); + + if (atomic_dec_and_test(&ldev->dmb_cnt)) + wake_up(&ldev->ldev_release); +} + +static int dibs_lo_unregister_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb) +{ + struct dibs_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct dibs_lo_dev *ldev; + unsigned long flags; + + ldev = dibs->drv_priv; + + /* find dmb from hash table */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { + if (tmp_node->token == dmb->dmb_tok) { + dmb_node = tmp_node; + break; + } + } + read_unlock_bh(&ldev->dmb_ht_lock); + if (!dmb_node) + return -EINVAL; + + if (refcount_dec_and_test(&dmb_node->refcnt)) { + spin_lock_irqsave(&dibs->lock, flags); + dibs->dmb_clientid_arr[dmb_node->sba_idx] = NO_DIBS_CLIENT; + spin_unlock_irqrestore(&dibs->lock, flags); + + __dibs_lo_unregister_dmb(ldev, dmb_node); + } + return 0; +} + +static int dibs_lo_support_dmb_nocopy(struct dibs_dev *dibs) +{ + return DIBS_LO_SUPPORT_NOCOPY; +} + +static int dibs_lo_attach_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb) +{ + struct dibs_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct dibs_lo_dev *ldev; + + ldev = dibs->drv_priv; + + /* find dmb_node according to dmb->dmb_tok */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { + if (tmp_node->token == dmb->dmb_tok) { + dmb_node = tmp_node; + break; + } + } + if (!dmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + read_unlock_bh(&ldev->dmb_ht_lock); + + if (!refcount_inc_not_zero(&dmb_node->refcnt)) + /* the dmb is being unregistered, but has + * not been removed from the hash table. 
+ */ + return -EINVAL; + + /* provide dmb information */ + dmb->idx = dmb_node->sba_idx; + dmb->dmb_tok = dmb_node->token; + dmb->cpu_addr = dmb_node->cpu_addr; + dmb->dma_addr = dmb_node->dma_addr; + dmb->dmb_len = dmb_node->len; + return 0; +} + +static int dibs_lo_detach_dmb(struct dibs_dev *dibs, u64 token) +{ + struct dibs_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct dibs_lo_dev *ldev; + + ldev = dibs->drv_priv; + + /* find dmb_node according to dmb->dmb_tok */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, token) { + if (tmp_node->token == token) { + dmb_node = tmp_node; + break; + } + } + if (!dmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + read_unlock_bh(&ldev->dmb_ht_lock); + + if (refcount_dec_and_test(&dmb_node->refcnt)) + __dibs_lo_unregister_dmb(ldev, dmb_node); + return 0; +} + +static int dibs_lo_move_data(struct dibs_dev *dibs, u64 dmb_tok, + unsigned int idx, bool sf, unsigned int offset, + void *data, unsigned int size) +{ + struct dibs_lo_dmb_node *rmb_node = NULL, *tmp_node; + struct dibs_lo_dev *ldev; + u16 s_mask; + u8 client_id; + u32 sba_idx; + + ldev = dibs->drv_priv; + + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) { + if (tmp_node->token == dmb_tok) { + rmb_node = tmp_node; + break; + } + } + if (!rmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + memcpy((char *)rmb_node->cpu_addr + offset, data, size); + sba_idx = rmb_node->sba_idx; + read_unlock_bh(&ldev->dmb_ht_lock); + + if (!sf) + return 0; + + spin_lock(&dibs->lock); + client_id = dibs->dmb_clientid_arr[sba_idx]; + s_mask = ror16(0x1000, idx); + if (likely(client_id != NO_DIBS_CLIENT && dibs->subs[client_id])) + dibs->subs[client_id]->ops->handle_irq(dibs, sba_idx, s_mask); + spin_unlock(&dibs->lock); + + return 0; +} + static const struct dibs_dev_ops dibs_lo_ops = { .get_fabric_id = dibs_lo_get_fabric_id, .query_remote_gid = dibs_lo_query_rgid, + .max_dmbs = dibs_lo_max_dmbs, + .register_dmb = dibs_lo_register_dmb, + .unregister_dmb = dibs_lo_unregister_dmb, + .move_data = dibs_lo_move_data, + .support_mmapped_rdmb = dibs_lo_support_dmb_nocopy, + .attach_dmb = dibs_lo_attach_dmb, + .detach_dmb = dibs_lo_detach_dmb, }; +static void dibs_lo_dev_init(struct dibs_lo_dev *ldev) +{ + rwlock_init(&ldev->dmb_ht_lock); + hash_init(ldev->dmb_ht); + atomic_set(&ldev->dmb_cnt, 0); + init_waitqueue_head(&ldev->ldev_release); +} + +static void dibs_lo_dev_exit(struct dibs_lo_dev *ldev) +{ + if (atomic_read(&ldev->dmb_cnt)) + wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); +} + static int dibs_lo_dev_probe(void) { struct dibs_lo_dev *ldev; @@ -56,6 +310,7 @@ static int dibs_lo_dev_probe(void) ldev->dibs = dibs; dibs->drv_priv = ldev; + dibs_lo_dev_init(ldev); uuid_gen(&dibs->gid); dibs->ops = &dibs_lo_ops; @@ -69,6 +324,7 @@ static int dibs_lo_dev_probe(void) return 0; err_reg: + kfree(dibs->dmb_clientid_arr); /* pairs with dibs_dev_alloc() */ put_device(&dibs->dev); kfree(ldev); @@ -82,6 +338,7 @@ static void dibs_lo_dev_remove(void) return; dibs_dev_del(lo_dev->dibs); + dibs_lo_dev_exit(lo_dev); /* pairs with dibs_dev_alloc() */ put_device(&lo_dev->dibs->dev); kfree(lo_dev); diff --git a/drivers/dibs/dibs_loopback.h b/drivers/dibs/dibs_loopback.h index fd03b6333a24..0664f6a8e662 100644 --- a/drivers/dibs/dibs_loopback.h +++ b/drivers/dibs/dibs_loopback.h @@ -13,13 +13,32 @@ #define _DIBS_LOOPBACK_H #include +#include +#include #include #include #if 
IS_ENABLED(CONFIG_DIBS_LO) +#define DIBS_LO_DMBS_HASH_BITS 12 +#define DIBS_LO_MAX_DMBS 5000 + +struct dibs_lo_dmb_node { + struct hlist_node list; + u64 token; + u32 len; + u32 sba_idx; + void *cpu_addr; + dma_addr_t dma_addr; + refcount_t refcnt; +}; struct dibs_lo_dev { struct dibs_dev *dibs; + atomic_t dmb_cnt; + rwlock_t dmb_ht_lock; + DECLARE_BITMAP(sba_idx_mask, DIBS_LO_MAX_DMBS); + DECLARE_HASHTABLE(dmb_ht, DIBS_LO_DMBS_HASH_BITS); + wait_queue_head_t ldev_release; }; int dibs_loopback_init(void); diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index f20ed0594a51..aacb3ea7825a 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -36,6 +36,16 @@ static struct dibs_dev_list dibs_dev_list = { .mutex = __MUTEX_INITIALIZER(dibs_dev_list.mutex), }; +static void dibs_setup_forwarding(struct dibs_client *client, + struct dibs_dev *dibs) +{ + unsigned long flags; + + spin_lock_irqsave(&dibs->lock, flags); + dibs->subs[client->id] = client; + spin_unlock_irqrestore(&dibs->lock, flags); +} + int dibs_register_client(struct dibs_client *client) { struct dibs_dev *dibs; @@ -60,6 +70,7 @@ int dibs_register_client(struct dibs_client *client) list_for_each_entry(dibs, &dibs_dev_list.list, list) { dibs->priv[i] = NULL; client->ops->add_dev(dibs); + dibs_setup_forwarding(client, dibs); } } mutex_unlock(&dibs_dev_list.mutex); @@ -71,10 +82,25 @@ EXPORT_SYMBOL_GPL(dibs_register_client); int dibs_unregister_client(struct dibs_client *client) { struct dibs_dev *dibs; + unsigned long flags; + int max_dmbs; int rc = 0; mutex_lock(&dibs_dev_list.mutex); list_for_each_entry(dibs, &dibs_dev_list.list, list) { + spin_lock_irqsave(&dibs->lock, flags); + max_dmbs = dibs->ops->max_dmbs(); + for (int i = 0; i < max_dmbs; ++i) { + if (dibs->dmb_clientid_arr[i] == client->id) { + WARN(1, "%s: attempt to unregister '%s' with registered dmb(s)\n", + __func__, client->name); + rc = -EBUSY; + goto err_reg_dmb; + } + } + /* Stop forwarding IRQs */ + dibs->subs[client->id] = NULL; + spin_unlock_irqrestore(&dibs->lock, flags); clients[client->id]->ops->del_dev(dibs); dibs->priv[client->id] = NULL; } @@ -87,6 +113,11 @@ int dibs_unregister_client(struct dibs_client *client) mutex_unlock(&dibs_dev_list.mutex); return rc; + +err_reg_dmb: + spin_unlock_irqrestore(&dibs->lock, flags); + mutex_unlock(&dibs_dev_list.mutex); + return rc; } EXPORT_SYMBOL_GPL(dibs_unregister_client); @@ -150,11 +181,19 @@ static const struct attribute_group dibs_dev_attr_group = { int dibs_dev_add(struct dibs_dev *dibs) { + int max_dmbs; int i, ret; + max_dmbs = dibs->ops->max_dmbs(); + spin_lock_init(&dibs->lock); + dibs->dmb_clientid_arr = kzalloc(max_dmbs, GFP_KERNEL); + if (!dibs->dmb_clientid_arr) + return -ENOMEM; + memset(dibs->dmb_clientid_arr, NO_DIBS_CLIENT, max_dmbs); + ret = device_add(&dibs->dev); if (ret) - return ret; + goto free_client_arr; ret = sysfs_create_group(&dibs->dev.kobj, &dibs_dev_attr_group); if (ret) { @@ -164,8 +203,10 @@ int dibs_dev_add(struct dibs_dev *dibs) mutex_lock(&dibs_dev_list.mutex); mutex_lock(&clients_lock); for (i = 0; i < max_client; ++i) { - if (clients[i]) + if (clients[i]) { clients[i]->ops->add_dev(dibs); + dibs_setup_forwarding(clients[i], dibs); + } } mutex_unlock(&clients_lock); list_add(&dibs->list, &dibs_dev_list.list); @@ -175,6 +216,8 @@ int dibs_dev_add(struct dibs_dev *dibs) err_device_del: device_del(&dibs->dev); +free_client_arr: + kfree(dibs->dmb_clientid_arr); return ret; } @@ -182,8 +225,16 @@ EXPORT_SYMBOL_GPL(dibs_dev_add); void dibs_dev_del(struct 
dibs_dev *dibs) { + unsigned long flags; int i; + sysfs_remove_group(&dibs->dev.kobj, &dibs_dev_attr_group); + + spin_lock_irqsave(&dibs->lock, flags); + for (i = 0; i < MAX_DIBS_CLIENTS; ++i) + dibs->subs[i] = NULL; + spin_unlock_irqrestore(&dibs->lock, flags); + mutex_lock(&dibs_dev_list.mutex); mutex_lock(&clients_lock); for (i = 0; i < max_client; ++i) { @@ -195,6 +246,7 @@ void dibs_dev_del(struct dibs_dev *dibs) mutex_unlock(&dibs_dev_list.mutex); device_del(&dibs->dev); + kfree(dibs->dmb_clientid_arr); } EXPORT_SYMBOL_GPL(dibs_dev_del); diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 121b3a2be760..346d1ea8650b 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -98,14 +98,6 @@ int ism_unregister_client(struct ism_client *client) spin_lock_irqsave(&ism->lock, flags); /* Stop forwarding IRQs and events */ ism->subs[client->id] = NULL; - for (int i = 0; i < ISM_NR_DMBS; ++i) { - if (ism->sba_client_arr[i] == client->id) { - WARN(1, "%s: attempt to unregister '%s' with registered dmb(s)\n", - __func__, client->name); - rc = -EBUSY; - goto err_reg_dmb; - } - } spin_unlock_irqrestore(&ism->lock, flags); } mutex_unlock(&ism_dev_list.mutex); @@ -116,11 +108,6 @@ int ism_unregister_client(struct ism_client *client) max_client--; mutex_unlock(&clients_lock); return rc; - -err_reg_dmb: - spin_unlock_irqrestore(&ism->lock, flags); - mutex_unlock(&ism_dev_list.mutex); - return rc; } EXPORT_SYMBOL_GPL(ism_unregister_client); @@ -308,15 +295,20 @@ static int ism_query_rgid(struct dibs_dev *dibs, const uuid_t *rgid, return ism_cmd(ism, &cmd); } -static void ism_free_dmb(struct ism_dev *ism, struct ism_dmb *dmb) +static int ism_max_dmbs(void) +{ + return ISM_NR_DMBS; +} + +static void ism_free_dmb(struct ism_dev *ism, struct dibs_dmb *dmb) { - clear_bit(dmb->sba_idx, ism->sba_bitmap); + clear_bit(dmb->idx, ism->sba_bitmap); dma_unmap_page(&ism->pdev->dev, dmb->dma_addr, dmb->dmb_len, DMA_FROM_DEVICE); folio_put(virt_to_folio(dmb->cpu_addr)); } -static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) +static int ism_alloc_dmb(struct ism_dev *ism, struct dibs_dmb *dmb) { struct folio *folio; unsigned long bit; @@ -325,16 +317,16 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) if (PAGE_ALIGN(dmb->dmb_len) > dma_get_max_seg_size(&ism->pdev->dev)) return -EINVAL; - if (!dmb->sba_idx) { + if (!dmb->idx) { bit = find_next_zero_bit(ism->sba_bitmap, ISM_NR_DMBS, ISM_DMB_BIT_OFFSET); if (bit == ISM_NR_DMBS) return -ENOSPC; - dmb->sba_idx = bit; + dmb->idx = bit; } - if (dmb->sba_idx < ISM_DMB_BIT_OFFSET || - test_and_set_bit(dmb->sba_idx, ism->sba_bitmap)) + if (dmb->idx < ISM_DMB_BIT_OFFSET || + test_and_set_bit(dmb->idx, ism->sba_bitmap)) return -EINVAL; folio = folio_alloc(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC | @@ -359,13 +351,14 @@ static int ism_alloc_dmb(struct ism_dev *ism, struct ism_dmb *dmb) out_free: kfree(dmb->cpu_addr); out_bit: - clear_bit(dmb->sba_idx, ism->sba_bitmap); + clear_bit(dmb->idx, ism->sba_bitmap); return rc; } -int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, - struct ism_client *client) +static int ism_register_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb, + struct dibs_client *client) { + struct ism_dev *ism = dibs->drv_priv; union ism_reg_dmb cmd; unsigned long flags; int ret; @@ -380,10 +373,10 @@ int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, cmd.request.dmb = dmb->dma_addr; cmd.request.dmb_len = dmb->dmb_len; - cmd.request.sba_idx = dmb->sba_idx; + 
cmd.request.sba_idx = dmb->idx; cmd.request.vlan_valid = dmb->vlan_valid; cmd.request.vlan_id = dmb->vlan_id; - cmd.request.rgid = dmb->rgid; + memcpy(&cmd.request.rgid, &dmb->rgid, sizeof(u64)); ret = ism_cmd(ism, &cmd); if (ret) { @@ -391,16 +384,16 @@ int ism_register_dmb(struct ism_dev *ism, struct ism_dmb *dmb, goto out; } dmb->dmb_tok = cmd.response.dmb_tok; - spin_lock_irqsave(&ism->lock, flags); - ism->sba_client_arr[dmb->sba_idx - ISM_DMB_BIT_OFFSET] = client->id; - spin_unlock_irqrestore(&ism->lock, flags); + spin_lock_irqsave(&dibs->lock, flags); + dibs->dmb_clientid_arr[dmb->idx - ISM_DMB_BIT_OFFSET] = client->id; + spin_unlock_irqrestore(&dibs->lock, flags); out: return ret; } -EXPORT_SYMBOL_GPL(ism_register_dmb); -int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb) +static int ism_unregister_dmb(struct dibs_dev *dibs, struct dibs_dmb *dmb) { + struct ism_dev *ism = dibs->drv_priv; union ism_unreg_dmb cmd; unsigned long flags; int ret; @@ -411,9 +404,9 @@ int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb) cmd.request.dmb_tok = dmb->dmb_tok; - spin_lock_irqsave(&ism->lock, flags); - ism->sba_client_arr[dmb->sba_idx - ISM_DMB_BIT_OFFSET] = NO_CLIENT; - spin_unlock_irqrestore(&ism->lock, flags); + spin_lock_irqsave(&dibs->lock, flags); + dibs->dmb_clientid_arr[dmb->idx - ISM_DMB_BIT_OFFSET] = NO_DIBS_CLIENT; + spin_unlock_irqrestore(&dibs->lock, flags); ret = ism_cmd(ism, &cmd); if (ret && ret != ISM_ERROR) @@ -423,7 +416,6 @@ int ism_unregister_dmb(struct ism_dev *ism, struct ism_dmb *dmb) out: return ret; } -EXPORT_SYMBOL_GPL(ism_unregister_dmb); static int ism_add_vlan_id(struct dibs_dev *dibs, u64 vlan_id) { @@ -459,9 +451,11 @@ static unsigned int max_bytes(unsigned int start, unsigned int len, return min(boundary - (start & (boundary - 1)), len); } -int ism_move(struct ism_dev *ism, u64 dmb_tok, unsigned int idx, bool sf, - unsigned int offset, void *data, unsigned int size) +static int ism_move(struct dibs_dev *dibs, u64 dmb_tok, unsigned int idx, + bool sf, unsigned int offset, void *data, + unsigned int size) { + struct ism_dev *ism = dibs->drv_priv; unsigned int bytes; u64 dmb_req; int ret; @@ -482,7 +476,6 @@ int ism_move(struct ism_dev *ism, u64 dmb_tok, unsigned int idx, bool sf, return 0; } -EXPORT_SYMBOL_GPL(ism_move); static u16 ism_get_chid(struct dibs_dev *dibs) { @@ -518,14 +511,17 @@ static irqreturn_t ism_handle_irq(int irq, void *data) { struct ism_dev *ism = data; unsigned long bit, end; + struct dibs_dev *dibs; unsigned long *bv; u16 dmbemask; u8 client_id; + dibs = ism->dibs; + bv = (void *) &ism->sba->dmb_bits[ISM_DMB_WORD_OFFSET]; end = sizeof(ism->sba->dmb_bits) * BITS_PER_BYTE - ISM_DMB_BIT_OFFSET; - spin_lock(&ism->lock); + spin_lock(&dibs->lock); ism->sba->s = 0; barrier(); for (bit = 0;;) { @@ -537,10 +533,13 @@ static irqreturn_t ism_handle_irq(int irq, void *data) dmbemask = ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET]; ism->sba->dmbe_mask[bit + ISM_DMB_BIT_OFFSET] = 0; barrier(); - client_id = ism->sba_client_arr[bit]; - if (unlikely(client_id == NO_CLIENT || !ism->subs[client_id])) + client_id = dibs->dmb_clientid_arr[bit]; + if (unlikely(client_id == NO_DIBS_CLIENT || + !dibs->subs[client_id])) continue; - ism->subs[client_id]->handle_irq(ism, bit + ISM_DMB_BIT_OFFSET, dmbemask); + dibs->subs[client_id]->ops->handle_irq(dibs, + bit + ISM_DMB_BIT_OFFSET, + dmbemask); } if (ism->sba->e) { @@ -548,13 +547,17 @@ static irqreturn_t ism_handle_irq(int irq, void *data) barrier(); ism_handle_event(ism); } - 
spin_unlock(&ism->lock); + spin_unlock(&dibs->lock); return IRQ_HANDLED; } static const struct dibs_dev_ops ism_ops = { .get_fabric_id = ism_get_chid, .query_remote_gid = ism_query_rgid, + .max_dmbs = ism_max_dmbs, + .register_dmb = ism_register_dmb, + .unregister_dmb = ism_unregister_dmb, + .move_data = ism_move, .add_vlan_id = ism_add_vlan_id, .del_vlan_id = ism_del_vlan_id, }; @@ -568,15 +571,10 @@ static int ism_dev_init(struct ism_dev *ism) if (ret <= 0) goto out; - ism->sba_client_arr = kzalloc(ISM_NR_DMBS, GFP_KERNEL); - if (!ism->sba_client_arr) - goto free_vectors; - memset(ism->sba_client_arr, NO_CLIENT, ISM_NR_DMBS); - ret = request_irq(pci_irq_vector(pdev, 0), ism_handle_irq, 0, pci_name(pdev), ism); if (ret) - goto free_client_arr; + goto free_vectors; ret = register_sba(ism); if (ret) @@ -605,8 +603,6 @@ unreg_sba: unregister_sba(ism); free_irq: free_irq(pci_irq_vector(pdev, 0), ism); -free_client_arr: - kfree(ism->sba_client_arr); free_vectors: pci_free_irq_vectors(pdev); out: @@ -629,7 +625,6 @@ static void ism_dev_exit(struct ism_dev *ism) unregister_ieq(ism); unregister_sba(ism); free_irq(pci_irq_vector(pdev, 0), ism); - kfree(ism->sba_client_arr); pci_free_irq_vectors(pdev); list_del_init(&ism->list); mutex_unlock(&ism_dev_list.mutex); @@ -677,6 +672,9 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) dibs->drv_priv = ism; dibs->ops = &ism_ops; + /* enable ism device, but any interrupts and events will be ignored + * before dibs_dev_add() adds it to any clients. + */ ret = ism_dev_init(ism); if (ret) goto err_dibs; @@ -766,17 +764,6 @@ module_exit(ism_exit); /*************************** SMC-D Implementation *****************************/ #if IS_ENABLED(CONFIG_SMC) -static int smcd_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, - void *client) -{ - return ism_register_dmb(smcd->priv, (struct ism_dmb *)dmb, client); -} - -static int smcd_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - return ism_unregister_dmb(smcd->priv, (struct ism_dmb *)dmb); -} - static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, u32 event_code, u64 info) { @@ -801,18 +788,8 @@ static int smcd_signal_ieq(struct smcd_dev *smcd, struct smcd_gid *rgid, trigger_irq, event_code, info); } -static int smcd_move(struct smcd_dev *smcd, u64 dmb_tok, unsigned int idx, - bool sf, unsigned int offset, void *data, - unsigned int size) -{ - return ism_move(smcd->priv, dmb_tok, idx, sf, offset, data, size); -} - static const struct smcd_ops ism_smcd_ops = { - .register_dmb = smcd_register_dmb, - .unregister_dmb = smcd_unregister_dmb, .signal_event = smcd_signal_ieq, - .move_data = smcd_move, }; const struct smcd_ops *ism_get_smcd_ops(void) diff --git a/include/linux/dibs.h b/include/linux/dibs.h index c75a40fe3039..be009c614205 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -36,12 +36,44 @@ * clients. */ +/* DMB - Direct Memory Buffer + * -------------------------- + * A dibs client provides a dmb as input buffer for a local receiving + * dibs device for exactly one (remote) sending dibs device. Only this + * sending device can send data into this dmb using move_data(). Sender + * and receiver can be the same device. A dmb belongs to exactly one client. + */ +struct dibs_dmb { + /* tok - Token for this dmb + * Used by remote and local devices and clients to address this dmb. + * Provided by dibs fabric. Unique per dibs fabric. 
+ */ + u64 dmb_tok; + /* rgid - GID of designated remote sending device */ + uuid_t rgid; + /* cpu_addr - buffer address */ + void *cpu_addr; + /* len - buffer length */ + u32 dmb_len; + /* idx - Index of this DMB on this receiving device */ + u32 idx; + /* VLAN support (deprecated) + * In order to write into a vlan-tagged dmb, the remote device needs + * to belong to this vlan + */ + u32 vlan_valid; + u32 vlan_id; + /* optional, used by device driver */ + dma_addr_t dma_addr; +}; + struct dibs_dev; /* DIBS client * ----------- */ #define MAX_DIBS_CLIENTS 8 +#define NO_DIBS_CLIENT 0xff /* All dibs clients have access to all dibs devices. * A dibs client provides the following functions to be called by dibs layer or * dibs device drivers: @@ -69,6 +101,22 @@ struct dibs_client_ops { * The device is no longer usable by this client after this call. */ void (*del_dev)(struct dibs_dev *dev); + /** + * handle_irq() - Handle signaling for a DMB + * @dev: device that owns the dmb + * @idx: Index of the dmb that got signalled + * @dmbemask: signaling mask of the dmb + * + * Handle signaling for a dmb that was registered by this client + * for this device. + * The dibs device can coalesce multiple signaling triggers into a + * single call of handle_irq(). dmbemask can be used to indicate + * different kinds of triggers. + * + * Context: Called in IRQ context by dibs device driver + */ + void (*handle_irq)(struct dibs_dev *dev, unsigned int idx, + u16 dmbemask); }; struct dibs_client { @@ -147,6 +195,77 @@ struct dibs_dev_ops { */ int (*query_remote_gid)(struct dibs_dev *dev, const uuid_t *rgid, u32 vid_valid, u32 vid); + /** + * max_dmbs() + * Return: Max number of DMBs that can be registered for this kind of + * dibs_dev + */ + int (*max_dmbs)(void); + /** + * register_dmb() - allocate and register a dmb + * @dev: dibs device + * @dmb: dmb struct to be registered + * @client: dibs client + * @vid: VLAN id; deprecated, ignored if device does not support vlan + * + * The following fields of dmb must provide valid input: + * @rgid: gid of remote user device + * @dmb_len: buffer length + * @idx: Optionally: requested idx (if non-zero) + * @vlan_valid: if zero, vlan_id will be ignored; + * deprecated, ignored if device does not support vlan + * @vlan_id: deprecated, ignored if device does not support vlan + * Upon return, in addition the following fields will be valid: + * @dmb_tok: for usage by remote and local devices and clients + * @cpu_addr: allocated buffer + * @idx: dmb index, unique per dibs device + * @dma_addr: to be used by device driver, if applicable + * + * Allocate a dmb buffer and register it with this device and for this + * client. + * Return: zero on success + */ + int (*register_dmb)(struct dibs_dev *dev, struct dibs_dmb *dmb, + struct dibs_client *client); + /** + * unregister_dmb() - unregister and free a dmb + * @dev: dibs device + * @dmb: dmb struct to be unregistered + * The following fields of dmb must provide valid input: + * @dmb_tok + * @cpu_addr + * @idx + * + * Free dmb.cpu_addr and unregister the dmb from this device. + * Return: zero on success + */ + int (*unregister_dmb)(struct dibs_dev *dev, struct dibs_dmb *dmb); + /** + * move_data() - write into a remote dmb + * @dev: Local sending dibs device + * @dmb_tok: Token of the remote dmb + * @idx: signaling index in dmbemask + * @sf: signaling flag; + * if true, idx will be turned on in the target dmbemask + * and the target device will be signaled.
+ * @offset: offset within target dmb + * @data: pointer to data to be sent + * @size: length of data to be sent, can be zero. + * + * Use dev to write data of size at offset into a remote dmb + * identified by dmb_tok. Data is moved synchronously; *data can + * be freed when this function returns. + * + * If signaling flag (sf) is true, bit number idx will be turned + * on in the dmbemask when handle_irq() is called at the remote + * dibs client that owns the target dmb. The target device may choose + * to coalesce the signaling triggers of multiple move_data() calls + * to the same target dmb into a single handle_irq() call. + * Return: zero on success + */ + int (*move_data)(struct dibs_dev *dev, u64 dmb_tok, unsigned int idx, + bool sf, unsigned int offset, void *data, + unsigned int size); /** * add_vlan_id() - add dibs device to vlan (optional, deprecated) * @dev: dibs device @@ -166,6 +285,55 @@ struct dibs_dev_ops { * Return: zero on success */ int (*del_vlan_id)(struct dibs_dev *dev, u64 vlan_id); + /** + * support_mmapped_rdmb() - can this device provide memory mapped + * remote dmbs? (optional) + * @dev: dibs device + * + * A dibs device can provide a kernel address + length that represent + * a remote target dmb (like MMIO). As an alternative to calling + * move_data(), a dibs client can write into such a ghost-send-buffer + * (= to this kernel address) and the data will automatically + * and immediately appear in the target dmb, even without calling + * move_data(). + * + * Either all 3 function pointers for support_mmapped_rdmb(), + * attach_dmb() and detach_dmb() are defined, or all of them must + * be NULL. + * + * Return: non-zero, if memory mapped remote dmbs are supported. + */ + int (*support_mmapped_rdmb)(struct dibs_dev *dev); + /** + * attach_dmb() - attach local memory to a remote dmb + * @dev: Local sending ism device + * @dmb: all other parameters are passed in the form of a + * dmb struct + * TODO: (THIS IS CONFUSING, should be changed) + * dmb_tok: (in) Token of the remote dmb we want to attach to + * cpu_addr: (out) MMIO address + * dma_addr: (out) MMIO address (if applicable, invalid otherwise) + * dmb_len: (out) length of local MMIO region, + * equal to length of remote DMB. + * sba_idx: (out) index of remote dmb (NOT HELPFUL, should be removed) + * + * Provides a memory address to the sender that can be used to + * directly write into the remote dmb. + * Memory is available until detach_dmb() is called. + * + * Return: Zero upon success, Error code otherwise + */ + int (*attach_dmb)(struct dibs_dev *dev, struct dibs_dmb *dmb); + /** + * detach_dmb() - Detach the ghost buffer from a remote dmb + * @dev: ism device + * @token: dmb token of the remote dmb + * + * No need to free cpu_addr.
+ * + * Return: Zero upon success, Error code otherwise + */ + int (*detach_dmb)(struct dibs_dev *dev, u64 token); }; struct dibs_dev { @@ -179,6 +347,15 @@ struct dibs_dev { /* priv pointer per client; for client usage only */ void *priv[MAX_DIBS_CLIENTS]; + + /* get this lock before accessing any of the fields below */ + spinlock_t lock; + /* array of client ids indexed by dmb idx; + * can be used as indices into priv and subs arrays + */ + u8 *dmb_clientid_arr; + /* Sparse array of all ISM clients */ + struct dibs_client *subs[MAX_DIBS_CLIENTS]; }; static inline void dibs_set_priv(struct dibs_dev *dev, diff --git a/include/linux/ism.h b/include/linux/ism.h index a926dd61b5a1..b7feb4dcd5a8 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -11,17 +11,6 @@ #include -struct ism_dmb { - u64 dmb_tok; - u64 rgid; - u32 dmb_len; - u32 sba_idx; - u32 vlan_valid; - u32 vlan_id; - void *cpu_addr; - dma_addr_t dma_addr; -}; - /* Unless we gain unexpected popularity, this limit should hold for a while */ #define MAX_CLIENTS 8 #define ISM_NR_DMBS 1920 @@ -36,7 +25,6 @@ struct ism_dev { struct ism_sba *sba; dma_addr_t sba_dma_addr; DECLARE_BITMAP(sba_bitmap, ISM_NR_DMBS); - u8 *sba_client_arr; /* entries are indices into 'clients' array */ void *priv[MAX_CLIENTS]; struct ism_eq *ieq; @@ -58,11 +46,6 @@ struct ism_event { struct ism_client { const char *name; void (*handle_event)(struct ism_dev *dev, struct ism_event *event); - /* Parameter dmbemask contains a bit vector with updated DMBEs, if sent - * via ism_move_data(). Callback function must handle all active bits - * indicated by dmbemask. - */ - void (*handle_irq)(struct ism_dev *dev, unsigned int bit, u16 dmbemask); /* Private area - don't touch! */ u8 id; }; @@ -79,12 +62,6 @@ static inline void ism_set_priv(struct ism_dev *dev, struct ism_client *client, dev->priv[client->id] = priv; } -int ism_register_dmb(struct ism_dev *dev, struct ism_dmb *dmb, - struct ism_client *client); -int ism_unregister_dmb(struct ism_dev *dev, struct ism_dmb *dmb); -int ism_move(struct ism_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, - unsigned int offset, void *data, unsigned int size); - const struct smcd_ops *ism_get_smcd_ops(void); #endif /* _ISM_H */ diff --git a/include/net/smc.h b/include/net/smc.h index 5bd135fb4d49..8e3debcf7db5 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -28,17 +28,6 @@ struct smc_hashinfo { }; /* SMCD/ISM device driver interface */ -struct smcd_dmb { - u64 dmb_tok; - u64 rgid; - u32 dmb_len; - u32 sba_idx; - u32 vlan_valid; - u32 vlan_id; - void *cpu_addr; - dma_addr_t dma_addr; -}; - #define ISM_EVENT_DMB 0 #define ISM_EVENT_GID 1 #define ISM_EVENT_SWR 2 @@ -53,25 +42,14 @@ struct smcd_gid { }; struct smcd_ops { - int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb, - void *client); - int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); - int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, - bool sf, unsigned int offset, void *data, - unsigned int size); - /* optional operations */ int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, u32 trigger_irq, u32 event_code, u64 info); - int (*support_dmb_nocopy)(struct smcd_dev *dev); - int (*attach_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); - int (*detach_dmb)(struct smcd_dev *dev, u64 token); }; struct smcd_dev { const struct smcd_ops *ops; void *priv; - void *client; struct dibs_dev *dibs; struct list_head list; spinlock_t lock; diff --git a/net/smc/Makefile b/net/smc/Makefile index 
96ccfdf246df..0e754cbc38f9 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -6,4 +6,3 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o smc-y += smc_tracepoint.o smc_inet.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o -smc-y += smc_loopback.o diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index d20d00b46825..01e49371d23d 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -15,7 +15,6 @@ #include "smc.h" #include "smc_core.h" #include "smc_ism.h" -#include "smc_loopback.h" #include "smc_pnet.h" #include "smc_netlink.h" #include "linux/ism.h" @@ -33,18 +32,19 @@ static void smcd_register_dev(struct dibs_dev *dibs); static void smcd_unregister_dev(struct dibs_dev *dibs); #if IS_ENABLED(CONFIG_ISM) static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); -static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, - u16 dmbemask); static struct ism_client smc_ism_client = { .name = "SMC-D", .handle_event = smcd_handle_event, - .handle_irq = smcd_handle_irq, }; #endif +static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, + u16 dmbemask); + static struct dibs_client_ops smc_client_ops = { .add_dev = smcd_register_dev, .del_dev = smcd_unregister_dev, + .handle_irq = smcd_handle_irq, }; static struct dibs_client smc_dibs_client = { @@ -221,18 +221,19 @@ out: void smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; + struct dibs_dmb dmb; if (!dmb_desc->dma_addr) return; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; - dmb.sba_idx = dmb_desc->sba_idx; + dmb.idx = dmb_desc->sba_idx; dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; - smcd->ops->unregister_dmb(smcd, &dmb); + + smcd->dibs->ops->unregister_dmb(smcd->dibs, &dmb); return; } @@ -240,17 +241,20 @@ void smc_ism_unregister_dmb(struct smcd_dev *smcd, int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; + struct dibs_dev *dibs; + struct dibs_dmb dmb; int rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_len = dmb_len; - dmb.sba_idx = dmb_desc->sba_idx; + dmb.idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; - dmb.rgid = lgr->peer_gid.gid; - rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, lgr->smcd->client); + copy_to_dibsgid(&dmb.rgid, &lgr->peer_gid); + + dibs = lgr->smcd->dibs; + rc = dibs->ops->register_dmb(dibs, &dmb, &smc_dibs_client); if (!rc) { - dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->sba_idx = dmb.idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; @@ -265,24 +269,24 @@ bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd) * merging sndbuf with peer DMB to avoid * data copies between them. 
*/ - return (smcd->ops->support_dmb_nocopy && - smcd->ops->support_dmb_nocopy(smcd)); + return (smcd->dibs->ops->support_mmapped_rdmb && + smcd->dibs->ops->support_mmapped_rdmb(smcd->dibs)); } int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; + struct dibs_dmb dmb; int rc = 0; - if (!dev->ops->attach_dmb) + if (!dev->dibs->ops->attach_dmb) return -EINVAL; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = token; - rc = dev->ops->attach_dmb(dev, &dmb); + rc = dev->dibs->ops->attach_dmb(dev->dibs, &dmb); if (!rc) { - dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->sba_idx = dmb.idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; @@ -294,10 +298,10 @@ int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token) { - if (!dev->ops->detach_dmb) + if (!dev->dibs->ops->detach_dmb) return -EINVAL; - return dev->ops->detach_dmb(dev, token); + return dev->dibs->ops->detach_dmb(dev->dibs, token); } static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, @@ -503,26 +507,20 @@ static void smcd_register_dev(struct dibs_dev *dibs) { struct smcd_dev *smcd, *fentry; const struct smcd_ops *ops; - struct smc_lo_dev *smc_lo; struct ism_dev *ism; + int max_dmbs; - if (smc_ism_is_loopback(dibs)) { - if (smc_loopback_init(&smc_lo)) - return; - } + max_dmbs = dibs->ops->max_dmbs(); if (smc_ism_is_loopback(dibs)) { - ops = smc_lo_get_smcd_ops(); - smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, - SMC_LO_MAX_DMBS); + ops = NULL; } else { ism = dibs->drv_priv; #if IS_ENABLED(CONFIG_ISM) ops = ism_get_smcd_ops(); #endif - smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, - ISM_NR_DMBS); } + smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, max_dmbs); if (!smcd) return; @@ -530,13 +528,11 @@ static void smcd_register_dev(struct dibs_dev *dibs) dibs_set_priv(dibs, &smc_dibs_client, smcd); if (smc_ism_is_loopback(dibs)) { - smcd->priv = smc_lo; - smc_lo->smcd = smcd; + smcd->priv = NULL; } else { smcd->priv = ism; #if IS_ENABLED(CONFIG_ISM) ism_set_priv(ism, &smc_ism_client, smcd); - smcd->client = &smc_ism_client; #endif } @@ -590,8 +586,6 @@ static void smcd_unregister_dev(struct dibs_dev *dibs) list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); destroy_workqueue(smcd->event_wq); - if (smc_ism_is_loopback(dibs)) - smc_loopback_exit(); kfree(smcd->conn); kfree(smcd); } @@ -624,6 +618,7 @@ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) wrk->event = *event; queue_work(smcd->event_wq, &wrk->work); } +#endif /* SMCD Device interrupt handler. Called from ISM device interrupt handler. * Parameters are the ism device pointer, DMB number, and the DMBE bitmask. @@ -632,10 +627,10 @@ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) * Context: * - Function called in IRQ context from ISM device driver IRQ handler. 
*/ -static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, +static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, u16 dmbemask) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); struct smc_connection *conn = NULL; unsigned long flags; @@ -645,7 +640,6 @@ static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } -#endif int smc_ism_signal_shutdown(struct smc_link_group *lgr) { diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 139e99da2c9f..a1575e31df73 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -69,7 +69,9 @@ static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok, { int rc; - rc = smcd->ops->move_data(smcd, dmb_tok, idx, sf, offset, data, len); + rc = smcd->dibs->ops->move_data(smcd->dibs, dmb_tok, idx, sf, offset, + data, len); + return rc < 0 ? rc : 0; } diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c deleted file mode 100644 index 52cba01cb209..000000000000 --- a/net/smc/smc_loopback.c +++ /dev/null @@ -1,294 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Shared Memory Communications Direct over loopback-ism device. - * - * Functions for loopback-ism device. - * - * Copyright (c) 2024, Alibaba Inc. - * - * Author: Wen Gu - * Tony Lu - * - */ - -#include -#include -#include -#include - -#include "smc_cdc.h" -#include "smc_ism.h" -#include "smc_loopback.h" - -#define SMC_LO_SUPPORT_NOCOPY 0x1 -#define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0) - -static struct smc_lo_dev *lo_dev; - -static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, - void *client_priv) -{ - struct smc_lo_dmb_node *dmb_node, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - int sba_idx, rc; - - /* check space for new dmb */ - for_each_clear_bit(sba_idx, ldev->sba_idx_mask, SMC_LO_MAX_DMBS) { - if (!test_and_set_bit(sba_idx, ldev->sba_idx_mask)) - break; - } - if (sba_idx == SMC_LO_MAX_DMBS) - return -ENOSPC; - - dmb_node = kzalloc(sizeof(*dmb_node), GFP_KERNEL); - if (!dmb_node) { - rc = -ENOMEM; - goto err_bit; - } - - dmb_node->sba_idx = sba_idx; - dmb_node->len = dmb->dmb_len; - dmb_node->cpu_addr = kzalloc(dmb_node->len, GFP_KERNEL | - __GFP_NOWARN | __GFP_NORETRY | - __GFP_NOMEMALLOC); - if (!dmb_node->cpu_addr) { - rc = -ENOMEM; - goto err_node; - } - dmb_node->dma_addr = SMC_DMA_ADDR_INVALID; - refcount_set(&dmb_node->refcnt, 1); - -again: - /* add new dmb into hash table */ - get_random_bytes(&dmb_node->token, sizeof(dmb_node->token)); - write_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_node->token) { - if (tmp_node->token == dmb_node->token) { - write_unlock_bh(&ldev->dmb_ht_lock); - goto again; - } - } - hash_add(ldev->dmb_ht, &dmb_node->list, dmb_node->token); - write_unlock_bh(&ldev->dmb_ht_lock); - atomic_inc(&ldev->dmb_cnt); - - dmb->sba_idx = dmb_node->sba_idx; - dmb->dmb_tok = dmb_node->token; - dmb->cpu_addr = dmb_node->cpu_addr; - dmb->dma_addr = dmb_node->dma_addr; - dmb->dmb_len = dmb_node->len; - - return 0; - -err_node: - kfree(dmb_node); -err_bit: - clear_bit(sba_idx, ldev->sba_idx_mask); - return rc; -} - -static void __smc_lo_unregister_dmb(struct smc_lo_dev *ldev, - struct smc_lo_dmb_node *dmb_node) -{ - /* remove dmb from hash table */ - write_lock_bh(&ldev->dmb_ht_lock); - hash_del(&dmb_node->list); - write_unlock_bh(&ldev->dmb_ht_lock); - - clear_bit(dmb_node->sba_idx, 
ldev->sba_idx_mask); - kvfree(dmb_node->cpu_addr); - kfree(dmb_node); - - if (atomic_dec_and_test(&ldev->dmb_cnt)) - wake_up(&ldev->ldev_release); -} - -static int smc_lo_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb from hash table */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { - if (tmp_node->token == dmb->dmb_tok) { - dmb_node = tmp_node; - break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (refcount_dec_and_test(&dmb_node->refcnt)) - __smc_lo_unregister_dmb(ldev, dmb_node); - return 0; -} - -static int smc_lo_support_dmb_nocopy(struct smcd_dev *smcd) -{ - return SMC_LO_SUPPORT_NOCOPY; -} - -static int smc_lo_attach_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb_node according to dmb->dmb_tok */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { - if (tmp_node->token == dmb->dmb_tok) { - dmb_node = tmp_node; - break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (!refcount_inc_not_zero(&dmb_node->refcnt)) - /* the dmb is being unregistered, but has - * not been removed from the hash table. - */ - return -EINVAL; - - /* provide dmb information */ - dmb->sba_idx = dmb_node->sba_idx; - dmb->dmb_tok = dmb_node->token; - dmb->cpu_addr = dmb_node->cpu_addr; - dmb->dma_addr = dmb_node->dma_addr; - dmb->dmb_len = dmb_node->len; - return 0; -} - -static int smc_lo_detach_dmb(struct smcd_dev *smcd, u64 token) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb_node according to dmb->dmb_tok */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, token) { - if (tmp_node->token == token) { - dmb_node = tmp_node; - break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (refcount_dec_and_test(&dmb_node->refcnt)) - __smc_lo_unregister_dmb(ldev, dmb_node); - return 0; -} - -static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, - unsigned int idx, bool sf, unsigned int offset, - void *data, unsigned int size) -{ - struct smc_lo_dmb_node *rmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - struct smc_connection *conn; - - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) { - if (tmp_node->token == dmb_tok) { - rmb_node = tmp_node; - break; - } - } - if (!rmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - memcpy((char *)rmb_node->cpu_addr + offset, data, size); - read_unlock_bh(&ldev->dmb_ht_lock); - - if (!sf) - return 0; - - conn = smcd->conn[rmb_node->sba_idx]; - if (!conn || conn->killed) - return -EPIPE; - tasklet_schedule(&conn->rx_tsklet); - return 0; -} - -static const struct smcd_ops lo_ops = { - .register_dmb = smc_lo_register_dmb, - .unregister_dmb = smc_lo_unregister_dmb, - .support_dmb_nocopy = smc_lo_support_dmb_nocopy, - .attach_dmb = smc_lo_attach_dmb, - .detach_dmb = smc_lo_detach_dmb, - .signal_event = NULL, - .move_data = smc_lo_move_data, -}; - -const struct smcd_ops 
*smc_lo_get_smcd_ops(void) -{ - return &lo_ops; -} - -static void smc_lo_dev_init(struct smc_lo_dev *ldev) -{ - rwlock_init(&ldev->dmb_ht_lock); - hash_init(ldev->dmb_ht); - atomic_set(&ldev->dmb_cnt, 0); - init_waitqueue_head(&ldev->ldev_release); - - return; -} - -static void smc_lo_dev_exit(struct smc_lo_dev *ldev) -{ - if (atomic_read(&ldev->dmb_cnt)) - wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); -} - -static int smc_lo_dev_probe(void) -{ - struct smc_lo_dev *ldev; - - ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); - if (!ldev) - return -ENOMEM; - - smc_lo_dev_init(ldev); - - lo_dev = ldev; /* global loopback device */ - - return 0; -} - -static void smc_lo_dev_remove(void) -{ - if (!lo_dev) - return; - - smc_lo_dev_exit(lo_dev); - kfree(lo_dev); - lo_dev = NULL; -} - -int smc_loopback_init(struct smc_lo_dev **smc_lb) -{ - int ret; - - ret = smc_lo_dev_probe(); - if (!ret) - *smc_lb = lo_dev; - return ret; -} - -void smc_loopback_exit(void) -{ - smc_lo_dev_remove(); -} diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h deleted file mode 100644 index 33bb96ec8b77..000000000000 --- a/net/smc/smc_loopback.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Shared Memory Communications Direct over loopback-ism device. - * - * SMC-D loopback-ism device structure definitions. - * - * Copyright (c) 2024, Alibaba Inc. - * - * Author: Wen Gu - * Tony Lu - * - */ - -#ifndef _SMC_LOOPBACK_H -#define _SMC_LOOPBACK_H - -#include -#include - -#define SMC_LO_MAX_DMBS 5000 -#define SMC_LO_DMBS_HASH_BITS 12 - -struct smc_lo_dmb_node { - struct hlist_node list; - u64 token; - u32 len; - u32 sba_idx; - void *cpu_addr; - dma_addr_t dma_addr; - refcount_t refcnt; -}; - -struct smc_lo_dev { - struct smcd_dev *smcd; - atomic_t dmb_cnt; - rwlock_t dmb_ht_lock; - DECLARE_BITMAP(sba_idx_mask, SMC_LO_MAX_DMBS); - DECLARE_HASHTABLE(dmb_ht, SMC_LO_DMBS_HASH_BITS); - wait_queue_head_t ldev_release; -}; - -const struct smcd_ops *smc_lo_get_smcd_ops(void); - -int smc_loopback_init(struct smc_lo_dev **smc_lb); -void smc_loopback_exit(void); - -#endif /* _SMC_LOOPBACK_H */ -- cgit v1.2.3 From a612dbe8d04d47af91fa88f0599c1370cc70f687 Mon Sep 17 00:00:00 2001 From: Julian Ruess Date: Thu, 18 Sep 2025 13:05:00 +0200 Subject: dibs: Move event handling to dibs layer Add defines for all event types and subtypes an ism device is known to produce, as they can be helpful for debugging purposes. Introduce a generic 'struct dibs_event' and adapt the ism device driver and the smc-d client accordingly. Tolerate and ignore other type and subtype values to enable future device extensions. SMC-D and ISM are now independent. struct ism_dev can be moved to drivers/s390/net/ism.h. Note that in smc, the term 'ism' is still used. Future patches could replace that with 'dibs' or 'smc-d' as appropriate.
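For illustration, a dibs client's event callback could be shaped like this (a minimal sketch; sample_handle_event is a made-up name, while the types, fields and enum values are the ones introduced by this patch):

	#include <linux/dibs.h>

	/* called in IRQ context by the dibs device driver for every event
	 * that the driver translated into a struct dibs_event
	 */
	static void sample_handle_event(struct dibs_dev *dev,
					const struct dibs_event *event)
	{
		switch (event->type) {
		case DIBS_BUF_EVENT:
			/* event->buffer_tok identifies the affected dmb */
			break;
		case DIBS_DEV_EVENT:
			/* event->gid identifies the affected remote device */
			break;
		case DIBS_SW_EVENT:
			/* event->subtype and event->data carry
			 * client-defined values, see signal_event()
			 */
			break;
		default:
			/* ignore unknown types for future extensions */
			break;
		}
	}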
Signed-off-by: Julian Ruess Co-developed-by: Alexandra Winter Signed-off-by: Alexandra Winter Link: https://patch.msgid.link/20250918110500.1731261-15-wintera@linux.ibm.com Signed-off-by: Paolo Abeni --- MAINTAINERS | 2 - arch/s390/configs/debug_defconfig | 1 + arch/s390/configs/defconfig | 1 + drivers/dibs/dibs_main.c | 2 +- drivers/s390/net/Kconfig | 1 - drivers/s390/net/ism.h | 42 +++++++- drivers/s390/net/ism_drv.c | 221 ++++++++++++-------------------------- include/linux/dibs.h | 62 +++++++++++ include/net/smc.h | 15 --- net/smc/Kconfig | 1 - net/smc/smc_ism.c | 99 ++++++----------- 11 files changed, 208 insertions(+), 239 deletions(-) (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index ecc55fae5f9d..1fbe541198f7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17580,7 +17580,6 @@ F: include/linux/fddidevice.h F: include/linux/hippidevice.h F: include/linux/if_* F: include/linux/inetdevice.h -F: include/linux/ism.h F: include/linux/netdev* F: include/linux/platform_data/wiznet.h F: include/uapi/linux/cn_proc.h @@ -22237,7 +22236,6 @@ L: linux-s390@vger.kernel.org L: netdev@vger.kernel.org S: Supported F: drivers/s390/net/ -F: include/linux/ism.h S390 PCI SUBSYSTEM M: Niklas Schnelle diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index a97c8d19f643..fdde8ee0d7bd 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -122,6 +122,7 @@ CONFIG_XFRM_USER=m CONFIG_NET_KEY=m CONFIG_DIBS=y CONFIG_DIBS_LO=y +CONFIG_SMC=m CONFIG_SMC_DIAG=m CONFIG_INET=y CONFIG_IP_MULTICAST=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 7f7b52d9a33c..bf9e7dbd4a89 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -113,6 +113,7 @@ CONFIG_XFRM_USER=m CONFIG_NET_KEY=m CONFIG_DIBS=y CONFIG_DIBS_LO=y +CONFIG_SMC=m CONFIG_SMC_DIAG=m CONFIG_INET=y CONFIG_IP_MULTICAST=y diff --git a/drivers/dibs/dibs_main.c b/drivers/dibs/dibs_main.c index aacb3ea7825a..5425238d5a42 100644 --- a/drivers/dibs/dibs_main.c +++ b/drivers/dibs/dibs_main.c @@ -98,7 +98,7 @@ int dibs_unregister_client(struct dibs_client *client) goto err_reg_dmb; } } - /* Stop forwarding IRQs */ + /* Stop forwarding IRQs and events */ dibs->subs[client->id] = NULL; spin_unlock_irqrestore(&dibs->lock, flags); clients[client->id]->ops->del_dev(dibs); diff --git a/drivers/s390/net/Kconfig b/drivers/s390/net/Kconfig index 92985f595d59..0fd700c5745a 100644 --- a/drivers/s390/net/Kconfig +++ b/drivers/s390/net/Kconfig @@ -82,7 +82,6 @@ config CCWGROUP config ISM tristate "Support for ISM vPCI Adapter" depends on PCI && DIBS - imply SMC default n help Select this option if you want to use the Internal Shared Memory diff --git a/drivers/s390/net/ism.h b/drivers/s390/net/ism.h index 1b9fa14da20c..08d17956cb36 100644 --- a/drivers/s390/net/ism.h +++ b/drivers/s390/net/ism.h @@ -6,13 +6,13 @@ #include #include #include -#include -#include #include #define UTIL_STR_LEN 16 #define ISM_ERROR 0xFFFF +#define ISM_NR_DMBS 1920 + /* * Do not use the first word of the DMB bits to ensure 8 byte aligned access. 
*/ @@ -34,6 +34,23 @@ #define ISM_UNREG_SBA 0x11 #define ISM_UNREG_IEQ 0x12 +enum ism_event_type { + ISM_EVENT_BUF = 0x00, + ISM_EVENT_DEV = 0x01, + ISM_EVENT_SWR = 0x02 +}; + +enum ism_event_code { + ISM_BUF_DMB_UNREGISTERED = 0x04, + ISM_BUF_USING_ISM_DEV_DISABLED = 0x08, + ISM_BUF_OWNING_ISM_DEV_IN_ERR_STATE = 0x02, + ISM_BUF_USING_ISM_DEV_IN_ERR_STATE = 0x03, + ISM_BUF_VLAN_MISMATCH_WITH_OWNER = 0x05, + ISM_BUF_VLAN_MISMATCH_WITH_USER = 0x06, + ISM_DEV_GID_DISABLED = 0x07, + ISM_DEV_GID_ERR_STATE = 0x01 +}; + struct ism_req_hdr { u32 cmd; u16 : 16; @@ -185,6 +202,14 @@ struct ism_eq_header { u64 : 64; }; +struct ism_event { + u32 type; + u32 code; + u64 tok; + u64 time; + u64 info; +}; + struct ism_eq { struct ism_eq_header header; struct ism_event entry[15]; @@ -199,6 +224,19 @@ struct ism_sba { u16 dmbe_mask[ISM_NR_DMBS]; }; +struct ism_dev { + spinlock_t cmd_lock; /* serializes cmds */ + struct dibs_dev *dibs; + struct pci_dev *pdev; + struct ism_sba *sba; + dma_addr_t sba_dma_addr; + DECLARE_BITMAP(sba_bitmap, ISM_NR_DMBS); + + struct ism_eq *ieq; + dma_addr_t ieq_dma_addr; + int ieq_idx; +}; + #define ISM_CREATE_REQ(dmb, idx, sf, offset) \ ((dmb) | (idx) << 24 | (sf) << 23 | (offset)) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 346d1ea8650b..f84aa2e676e9 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -31,86 +31,6 @@ MODULE_DEVICE_TABLE(pci, ism_device_table); static debug_info_t *ism_debug_info; -#define NO_CLIENT 0xff /* must be >= MAX_CLIENTS */ -static struct ism_client *clients[MAX_CLIENTS]; /* use an array rather than */ - /* a list for fast mapping */ -static u8 max_client; -static DEFINE_MUTEX(clients_lock); -struct ism_dev_list { - struct list_head list; - struct mutex mutex; /* protects ism device list */ -}; - -static struct ism_dev_list ism_dev_list = { - .list = LIST_HEAD_INIT(ism_dev_list.list), - .mutex = __MUTEX_INITIALIZER(ism_dev_list.mutex), -}; - -static void ism_setup_forwarding(struct ism_client *client, struct ism_dev *ism) -{ - unsigned long flags; - - spin_lock_irqsave(&ism->lock, flags); - ism->subs[client->id] = client; - spin_unlock_irqrestore(&ism->lock, flags); -} - -int ism_register_client(struct ism_client *client) -{ - struct ism_dev *ism; - int i, rc = -ENOSPC; - - mutex_lock(&ism_dev_list.mutex); - mutex_lock(&clients_lock); - for (i = 0; i < MAX_CLIENTS; ++i) { - if (!clients[i]) { - clients[i] = client; - client->id = i; - if (i == max_client) - max_client++; - rc = 0; - break; - } - } - mutex_unlock(&clients_lock); - - if (i < MAX_CLIENTS) { - /* initialize with all devices that we got so far */ - list_for_each_entry(ism, &ism_dev_list.list, list) { - ism->priv[i] = NULL; - ism_setup_forwarding(client, ism); - } - } - mutex_unlock(&ism_dev_list.mutex); - - return rc; -} -EXPORT_SYMBOL_GPL(ism_register_client); - -int ism_unregister_client(struct ism_client *client) -{ - struct ism_dev *ism; - unsigned long flags; - int rc = 0; - - mutex_lock(&ism_dev_list.mutex); - list_for_each_entry(ism, &ism_dev_list.list, list) { - spin_lock_irqsave(&ism->lock, flags); - /* Stop forwarding IRQs and events */ - ism->subs[client->id] = NULL; - spin_unlock_irqrestore(&ism->lock, flags); - } - mutex_unlock(&ism_dev_list.mutex); - - mutex_lock(&clients_lock); - clients[client->id] = NULL; - if (client->id + 1 == max_client) - max_client--; - mutex_unlock(&clients_lock); - return rc; -} -EXPORT_SYMBOL_GPL(ism_unregister_client); - static int ism_cmd(struct ism_dev *ism, void *cmd) { struct ism_req_hdr 
*req = cmd; @@ -445,6 +365,24 @@ static int ism_del_vlan_id(struct dibs_dev *dibs, u64 vlan_id) return ism_cmd(ism, &cmd); } +static int ism_signal_ieq(struct dibs_dev *dibs, const uuid_t *rgid, + u32 trigger_irq, u32 event_code, u64 info) +{ + struct ism_dev *ism = dibs->drv_priv; + union ism_sig_ieq cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.request.hdr.cmd = ISM_SIGNAL_IEQ; + cmd.request.hdr.len = sizeof(cmd.request); + + memcpy(&cmd.request.rgid, rgid, sizeof(cmd.request.rgid)); + cmd.request.trigger_irq = trigger_irq; + cmd.request.event_code = event_code; + cmd.request.info = info; + + return ism_cmd(ism, &cmd); +} + static unsigned int max_bytes(unsigned int start, unsigned int len, unsigned int boundary) { @@ -487,22 +425,68 @@ static u16 ism_get_chid(struct dibs_dev *dibs) return to_zpci(ism->pdev)->pchid; } +static int ism_match_event_type(u32 s390_event_type) +{ + switch (s390_event_type) { + case ISM_EVENT_BUF: + return DIBS_BUF_EVENT; + case ISM_EVENT_DEV: + return DIBS_DEV_EVENT; + case ISM_EVENT_SWR: + return DIBS_SW_EVENT; + default: + return DIBS_OTHER_TYPE; + } +} + +static int ism_match_event_subtype(u32 s390_event_subtype) +{ + switch (s390_event_subtype) { + case ISM_BUF_DMB_UNREGISTERED: + return DIBS_BUF_UNREGISTERED; + case ISM_DEV_GID_DISABLED: + return DIBS_DEV_DISABLED; + case ISM_DEV_GID_ERR_STATE: + return DIBS_DEV_ERR_STATE; + default: + return DIBS_OTHER_SUBTYPE; + } +} + static void ism_handle_event(struct ism_dev *ism) { + struct dibs_dev *dibs = ism->dibs; + struct dibs_event event; struct ism_event *entry; - struct ism_client *clt; + struct dibs_client *clt; int i; while ((ism->ieq_idx + 1) != READ_ONCE(ism->ieq->header.idx)) { - if (++(ism->ieq_idx) == ARRAY_SIZE(ism->ieq->entry)) + if (++ism->ieq_idx == ARRAY_SIZE(ism->ieq->entry)) ism->ieq_idx = 0; entry = &ism->ieq->entry[ism->ieq_idx]; debug_event(ism_debug_info, 2, entry, sizeof(*entry)); - for (i = 0; i < max_client; ++i) { - clt = ism->subs[i]; + __memset(&event, 0, sizeof(event)); + event.type = ism_match_event_type(entry->type); + if (event.type == DIBS_SW_EVENT) + event.subtype = entry->code; + else + event.subtype = ism_match_event_subtype(entry->code); + event.time = entry->time; + event.data = entry->info; + switch (event.type) { + case DIBS_BUF_EVENT: + event.buffer_tok = entry->tok; + break; + case DIBS_DEV_EVENT: + case DIBS_SW_EVENT: + memcpy(&event.gid, &entry->tok, sizeof(u64)); + } + for (i = 0; i < MAX_DIBS_CLIENTS; ++i) { + clt = dibs->subs[i]; if (clt) - clt->handle_event(ism, entry); + clt->ops->handle_event(dibs, &event); } } } @@ -560,12 +544,13 @@ static const struct dibs_dev_ops ism_ops = { .move_data = ism_move, .add_vlan_id = ism_add_vlan_id, .del_vlan_id = ism_del_vlan_id, + .signal_event = ism_signal_ieq, }; static int ism_dev_init(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; - int i, ret; + int ret; ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI); if (ret <= 0) @@ -584,18 +569,6 @@ static int ism_dev_init(struct ism_dev *ism) if (ret) goto unreg_sba; - mutex_lock(&ism_dev_list.mutex); - mutex_lock(&clients_lock); - for (i = 0; i < max_client; ++i) { - if (clients[i]) { - ism_setup_forwarding(clients[i], ism); - } - } - mutex_unlock(&clients_lock); - - list_add(&ism->list, &ism_dev_list.list); - mutex_unlock(&ism_dev_list.mutex); - query_info(ism); return 0; @@ -612,22 +585,11 @@ out: static void ism_dev_exit(struct ism_dev *ism) { struct pci_dev *pdev = ism->pdev; - unsigned long flags; - int i; - - spin_lock_irqsave(&ism->lock, flags); - for (i = 0; i 
< max_client; ++i) - ism->subs[i] = NULL; - spin_unlock_irqrestore(&ism->lock, flags); - - mutex_lock(&ism_dev_list.mutex); unregister_ieq(ism); unregister_sba(ism); free_irq(pci_irq_vector(pdev, 0), ism); pci_free_irq_vectors(pdev); - list_del_init(&ism->list); - mutex_unlock(&ism_dev_list.mutex); } static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) @@ -641,7 +603,6 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!ism) return -ENOMEM; - spin_lock_init(&ism->lock); spin_lock_init(&ism->cmd_lock); dev_set_drvdata(&pdev->dev, ism); ism->pdev = pdev; @@ -742,8 +703,6 @@ static int __init ism_init(void) if (!ism_debug_info) return -ENODEV; - memset(clients, 0, sizeof(clients)); - max_client = 0; debug_register_view(ism_debug_info, &debug_hex_ascii_view); ret = pci_register_driver(&ism_driver); if (ret) @@ -760,41 +719,3 @@ static void __exit ism_exit(void) module_init(ism_init); module_exit(ism_exit); - -/*************************** SMC-D Implementation *****************************/ - -#if IS_ENABLED(CONFIG_SMC) -static int ism_signal_ieq(struct ism_dev *ism, u64 rgid, u32 trigger_irq, - u32 event_code, u64 info) -{ - union ism_sig_ieq cmd; - - memset(&cmd, 0, sizeof(cmd)); - cmd.request.hdr.cmd = ISM_SIGNAL_IEQ; - cmd.request.hdr.len = sizeof(cmd.request); - - cmd.request.rgid = rgid; - cmd.request.trigger_irq = trigger_irq; - cmd.request.event_code = event_code; - cmd.request.info = info; - - return ism_cmd(ism, &cmd); -} - -static int smcd_signal_ieq(struct smcd_dev *smcd, struct smcd_gid *rgid, - u32 trigger_irq, u32 event_code, u64 info) -{ - return ism_signal_ieq(smcd->priv, rgid->gid, - trigger_irq, event_code, info); -} - -static const struct smcd_ops ism_smcd_ops = { - .signal_event = smcd_signal_ieq, -}; - -const struct smcd_ops *ism_get_smcd_ops(void) -{ - return &ism_smcd_ops; -} -EXPORT_SYMBOL_GPL(ism_get_smcd_ops); -#endif diff --git a/include/linux/dibs.h b/include/linux/dibs.h index be009c614205..c75607f8a5cf 100644 --- a/include/linux/dibs.h +++ b/include/linux/dibs.h @@ -67,6 +67,41 @@ struct dibs_dmb { dma_addr_t dma_addr; }; +/* DIBS events + * ----------- + * Dibs devices can optionally notify dibs clients about events that happened + * in the fabric or at the remote device or remote dmb. + */ +enum dibs_event_type { + /* Buffer event, e.g. a remote dmb was unregistered */ + DIBS_BUF_EVENT, + /* Device event, e.g. a remote dibs device was disabled */ + DIBS_DEV_EVENT, + /* Software event, a dibs client can send an event signal to a + * remote dibs device. 
+ */ + DIBS_SW_EVENT, + DIBS_OTHER_TYPE +}; + +enum dibs_event_subtype { + DIBS_BUF_UNREGISTERED, + DIBS_DEV_DISABLED, + DIBS_DEV_ERR_STATE, + DIBS_OTHER_SUBTYPE +}; + +struct dibs_event { + u32 type; + u32 subtype; + /* uuid_null if invalid */ + uuid_t gid; + /* zero if invalid */ + u64 buffer_tok; + u64 time; + /* additional data or zero */ + u64 data; +}; + struct dibs_dev; /* DIBS client @@ -117,6 +152,15 @@ struct dibs_client_ops { */ void (*handle_irq)(struct dibs_dev *dev, unsigned int idx, u16 dmbemask); + /** + * handle_event() - Handle control information sent by device + * @dev: device reporting the event + * @event: dibs event structure + * + * Context: Called in IRQ context by dibs device driver + */ + void (*handle_event)(struct dibs_dev *dev, + const struct dibs_event *event); }; struct dibs_client { @@ -285,6 +329,24 @@ struct dibs_dev_ops { * Return: zero on success */ int (*del_vlan_id)(struct dibs_dev *dev, u64 vlan_id); + /** + * signal_event() - trigger an event at a remote dibs device (optional) + * @dev: local dibs device + * @rgid: gid of remote dibs device + * @trigger_irq: zero: notification may be coalesced with other events + * non-zero: notify immediately + * @event_code: 4 byte event code, meaning is defined by dibs client + * @info: 8 bytes of additional information, + * meaning is defined by dibs client + * + * dibs devices can offer support for sending a control event of type + * EVENT_SWR to a remote dibs device. + * NOTE: handle_event() will be called for all registered dibs clients + * at the remote device. + * Return: zero on success + */ + int (*signal_event)(struct dibs_dev *dev, const uuid_t *rgid, + u32 trigger_irq, u32 event_code, u64 info); /** * support_mmapped_rdmb() - can this device provide memory mapped * remote dmbs?
(optional) diff --git a/include/net/smc.h b/include/net/smc.h index 8e3debcf7db5..08bee529ed8d 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -16,7 +16,6 @@ #include #include #include -#include "linux/ism.h" struct sock; @@ -28,28 +27,14 @@ struct smc_hashinfo { }; /* SMCD/ISM device driver interface */ -#define ISM_EVENT_DMB 0 -#define ISM_EVENT_GID 1 -#define ISM_EVENT_SWR 2 - #define ISM_RESERVED_VLANID 0x1FFF -struct smcd_dev; - struct smcd_gid { u64 gid; u64 gid_ext; }; -struct smcd_ops { - /* optional operations */ - int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, - u32 trigger_irq, u32 event_code, u64 info); -}; - struct smcd_dev { - const struct smcd_ops *ops; - void *priv; struct dibs_dev *dibs; struct list_head list; spinlock_t lock; diff --git a/net/smc/Kconfig b/net/smc/Kconfig index 9535d88c2acb..99ecd59d1f4b 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -2,7 +2,6 @@ config SMC tristate "SMC socket protocol family" depends on INET && INFINIBAND && DIBS - depends on m || ISM != m help SMC-R provides a "sockets over RDMA" solution making use of RDMA over Converged Ethernet (RoCE) technology to upgrade diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 01e49371d23d..7b228ca2f96a 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -17,7 +17,6 @@ #include "smc_ism.h" #include "smc_pnet.h" #include "smc_netlink.h" -#include "linux/ism.h" #include "linux/dibs.h" struct smcd_dev_list smcd_dev_list = { @@ -30,20 +29,15 @@ static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; static void smcd_register_dev(struct dibs_dev *dibs); static void smcd_unregister_dev(struct dibs_dev *dibs); -#if IS_ENABLED(CONFIG_ISM) -static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); - -static struct ism_client smc_ism_client = { - .name = "SMC-D", - .handle_event = smcd_handle_event, -}; -#endif +static void smcd_handle_event(struct dibs_dev *dibs, + const struct dibs_event *event); static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, u16 dmbemask); static struct dibs_client_ops smc_client_ops = { .add_dev = smcd_register_dev, .del_dev = smcd_unregister_dev, + .handle_event = smcd_handle_event, .handle_irq = smcd_handle_irq, }; @@ -399,11 +393,10 @@ int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -#if IS_ENABLED(CONFIG_ISM) struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; - struct ism_event event; + struct dibs_event event; }; #define ISM_EVENT_REQUEST 0x0001 @@ -423,25 +416,27 @@ union smcd_sw_event_info { static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) { - struct smcd_gid peer_gid = { .gid = wrk->event.tok, - .gid_ext = 0 }; + struct dibs_dev *dibs = wrk->smcd->dibs; union smcd_sw_event_info ev_info; + struct smcd_gid peer_gid; + uuid_t ism_rgid; - ev_info.info = wrk->event.info; - switch (wrk->event.code) { + copy_to_smcdgid(&peer_gid, &wrk->event.gid); + ev_info.info = wrk->event.data; + switch (wrk->event.subtype) { case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */ smc_smcd_terminate(wrk->smcd, &peer_gid, ev_info.vlan_id); break; case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ if (ev_info.code == ISM_EVENT_REQUEST && - wrk->smcd->ops->signal_event) { + dibs->ops->signal_event) { ev_info.code = ISM_EVENT_RESPONSE; - wrk->smcd->ops->signal_event(wrk->smcd, - &peer_gid, - ISM_EVENT_REQUEST_IR, - ISM_EVENT_CODE_TESTLINK, - ev_info.info); - } + copy_to_dibsgid(&ism_rgid, &peer_gid); + dibs->ops->signal_event(dibs, &ism_rgid, 
+ ISM_EVENT_REQUEST_IR, + ISM_EVENT_CODE_TESTLINK, + ev_info.info); + } break; } } @@ -451,26 +446,24 @@ static void smc_ism_event_work(struct work_struct *work) { struct smc_ism_event_work *wrk = container_of(work, struct smc_ism_event_work, work); - struct smcd_gid smcd_gid = { .gid = wrk->event.tok, - .gid_ext = 0 }; + struct smcd_gid smcd_gid; + + copy_to_smcdgid(&smcd_gid, &wrk->event.gid); switch (wrk->event.type) { - case ISM_EVENT_GID: /* GID event, token is peer GID */ + case DIBS_DEV_EVENT: /* GID event, token is peer GID */ smc_smcd_terminate(wrk->smcd, &smcd_gid, VLAN_VID_MASK); break; - case ISM_EVENT_DMB: + case DIBS_BUF_EVENT: break; - case ISM_EVENT_SWR: /* Software defined event */ + case DIBS_SW_EVENT: /* Software defined event */ smcd_handle_sw_event(wrk); break; } kfree(wrk); } -#endif -static struct smcd_dev *smcd_alloc_dev(const char *name, - const struct smcd_ops *ops, - int max_dmbs) +static struct smcd_dev *smcd_alloc_dev(const char *name, int max_dmbs) { struct smcd_dev *smcd; @@ -487,8 +480,6 @@ static struct smcd_dev *smcd_alloc_dev(const char *name, if (!smcd->event_wq) goto free_conn; - smcd->ops = ops; - spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); INIT_LIST_HEAD(&smcd->vlan); @@ -506,36 +497,17 @@ free_smcd: static void smcd_register_dev(struct dibs_dev *dibs) { struct smcd_dev *smcd, *fentry; - const struct smcd_ops *ops; - struct ism_dev *ism; int max_dmbs; max_dmbs = dibs->ops->max_dmbs(); - if (smc_ism_is_loopback(dibs)) { - ops = NULL; - } else { - ism = dibs->drv_priv; -#if IS_ENABLED(CONFIG_ISM) - ops = ism_get_smcd_ops(); -#endif - } - smcd = smcd_alloc_dev(dev_name(&dibs->dev), ops, max_dmbs); + smcd = smcd_alloc_dev(dev_name(&dibs->dev), max_dmbs); if (!smcd) return; smcd->dibs = dibs; dibs_set_priv(dibs, &smc_dibs_client, smcd); - if (smc_ism_is_loopback(dibs)) { - smcd->priv = NULL; - } else { - smcd->priv = ism; -#if IS_ENABLED(CONFIG_ISM) - ism_set_priv(ism, &smc_ism_client, smcd); -#endif - } - if (smc_pnetid_by_dev_port(dibs->dev.parent, 0, smcd->pnetid)) smc_pnetid_by_table_smcd(smcd); @@ -590,7 +562,6 @@ static void smcd_unregister_dev(struct dibs_dev *dibs) kfree(smcd); } -#if IS_ENABLED(CONFIG_ISM) /* SMCD Device event handler. Called from ISM device interrupt handler. * Parameters are ism device pointer, * - event->type (0 --> DMB, 1 --> GID), @@ -602,9 +573,10 @@ static void smcd_unregister_dev(struct dibs_dev *dibs) * Context: * - Function called in IRQ context from ISM device driver event handler. */ -static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) +static void smcd_handle_event(struct dibs_dev *dibs, + const struct dibs_event *event) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); struct smc_ism_event_work *wrk; if (smcd->going_away) @@ -618,7 +590,6 @@ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) wrk->event = *event; queue_work(smcd->event_wq, &wrk->work); } -#endif /* SMCD Device interrupt handler. Called from ISM device interrupt handler. * Parameters are the ism device pointer, DMB number, and the DMBE bitmask. 
@@ -644,22 +615,22 @@ static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, int smc_ism_signal_shutdown(struct smc_link_group *lgr) { int rc = 0; -#if IS_ENABLED(CONFIG_ISM) union smcd_sw_event_info ev_info; + uuid_t ism_rgid; if (lgr->peer_shutdown) return 0; - if (!lgr->smcd->ops->signal_event) + if (!lgr->smcd->dibs->ops->signal_event) return 0; memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; ev_info.code = ISM_EVENT_REQUEST; - rc = lgr->smcd->ops->signal_event(lgr->smcd, &lgr->peer_gid, + copy_to_dibsgid(&ism_rgid, &lgr->peer_gid); + rc = lgr->smcd->dibs->ops->signal_event(lgr->smcd->dibs, &ism_rgid, ISM_EVENT_REQUEST_IR, ISM_EVENT_CODE_SHUTDOWN, ev_info.info); -#endif return rc; } @@ -670,9 +641,6 @@ int smc_ism_init(void) smc_ism_v2_capable = false; smc_ism_create_system_eid(); -#if IS_ENABLED(CONFIG_ISM) - rc = ism_register_client(&smc_ism_client); -#endif rc = dibs_register_client(&smc_dibs_client); return rc; } @@ -680,7 +648,4 @@ int smc_ism_init(void) void smc_ism_exit(void) { dibs_unregister_client(&smc_dibs_client); -#if IS_ENABLED(CONFIG_ISM) - ism_unregister_client(&smc_ism_client); -#endif } -- cgit v1.2.3 From 9095d207417477eb50e84fd0652895db77ec584e Mon Sep 17 00:00:00 2001 From: André Almeida Date: Thu, 14 Aug 2025 14:22:12 -0300 Subject: fs: Create sb_encoding() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Filesystems that need to deal with the super block encoding need to use an #if IS_ENABLED(CONFIG_UNICODE) guard around it because this struct member is not declared otherwise. In order to move these #if/#endif guards outside of the filesystem code and make it simpler, create a new function that returns the s_encoding member of struct super_block if Unicode is enabled, and returns NULL otherwise. Suggested-by: Amir Goldstein Reviewed-by: Amir Goldstein Signed-off-by: André Almeida Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Amir Goldstein --- include/linux/fs.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..43b3a7cf6750 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3740,15 +3740,20 @@ static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qst } #endif -static inline bool sb_has_encoding(const struct super_block *sb) +static inline struct unicode_map *sb_encoding(const struct super_block *sb) { #if IS_ENABLED(CONFIG_UNICODE) - return !!sb->s_encoding; + return sb->s_encoding; #else - return false; + return NULL; #endif } +static inline bool sb_has_encoding(const struct super_block *sb) +{ + return !!sb_encoding(sb); +} + int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); -- cgit v1.2.3 From 23253e278a4574114d4c2729ed70f70b4ec7a30e Mon Sep 17 00:00:00 2001 From: André Almeida Date: Thu, 14 Aug 2025 14:22:13 -0300 Subject: fs: Create sb_same_encoding() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For cases where a file lookup can look in different filesystems (like in overlayfs), both super blocks must have the same encoding and the same flags. To help with that, create a sb_same_encoding() function.
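For illustration only, a sketch of how a filesystem might use the two helpers; the function names below are hypothetical and not part of this patch:

	#include <linux/fs.h>

	/* Look up the sb's encoding, if any (sketch only). */
	static int example_use_encoding(const struct super_block *sb)
	{
		struct unicode_map *um = sb_encoding(sb);

		if (!um)	/* NULL without CONFIG_UNICODE or no encoding set */
			return 0;
		/* e.g. hand um to the utf8 helpers here */
		return 0;
	}

	/* Overlayfs-style check: two layers must agree on encoding. */
	static bool example_layers_compatible(const struct super_block *upper,
					      const struct super_block *lower)
	{
		return sb_same_encoding(upper, lower);
	}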
Reviewed-by: Amir Goldstein Signed-off-by: André Almeida Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Amir Goldstein --- include/linux/fs.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 43b3a7cf6750..ec867f112fd5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3754,6 +3754,24 @@ static inline bool sb_has_encoding(const struct super_block *sb) return !!sb_encoding(sb); } +/* + * Compare if two super blocks have the same encoding and flags + */ +static inline bool sb_same_encoding(const struct super_block *sb1, + const struct super_block *sb2) +{ +#if IS_ENABLED(CONFIG_UNICODE) + if (sb1->s_encoding == sb2->s_encoding) + return true; + + return (sb1->s_encoding && sb2->s_encoding && + (sb1->s_encoding->version == sb2->s_encoding->version) && + (sb1->s_encoding_flags == sb2->s_encoding_flags)); +#else + return true; +#endif +} + int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); -- cgit v1.2.3 From 0494fc345b377d1207c2cbfef67dc51f6ec874c0 Mon Sep 17 00:00:00 2001 From: Gokul Praveen Date: Tue, 12 Aug 2025 16:23:46 +0530 Subject: clocksource/drivers/timer-ti-dm: Capture functionality for OMAP DM timer Add a PWM capture function to the DM timer driver. The OMAP DM timer hardware supports a capture feature. It can be used to timestamp events (falling/rising edges) detected on an input signal. Signed-off-by: Gokul Praveen Signed-off-by: Daniel Lezcano Reviewed-by: Neha Malcom Francis Link: https://lore.kernel.org/r/20250812105346.203541-1-g-praveen@ti.com --- drivers/clocksource/timer-ti-dm.c | 119 ++++++++++++++++++++++++++++- include/linux/platform_data/dmtimer-omap.h | 4 + 2 files changed, 121 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index e9e32df6b566..793e7cdcb1b1 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -31,6 +31,7 @@ #include #include +#include /* * timer errata flags @@ -836,6 +837,48 @@ static int omap_dm_timer_set_match(struct omap_dm_timer *cookie, int enable, return 0; } +static int omap_dm_timer_set_cap(struct omap_dm_timer *cookie, + int autoreload, bool config_period) +{ + struct dmtimer *timer; + struct device *dev; + int rc; + u32 l; + + timer = to_dmtimer(cookie); + if (unlikely(!timer)) + return -EINVAL; + + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; + /* + * 1. Select autoreload mode. TIMER_TCLR[1] AR bit. + * 2. TIMER_TCLR[14]: Sets the functionality of the TIMER IO pin. + * 3. TIMER_TCLR[13] : Capture mode select bit. + * 4. TIMER_TCLR[9-8] : Select transition capture mode.
+ */ + + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); + + if (autoreload) + l |= OMAP_TIMER_CTRL_AR; + + l |= OMAP_TIMER_CTRL_CAPTMODE | OMAP_TIMER_CTRL_GPOCFG; + + if (config_period == true) + l |= OMAP_TIMER_CTRL_TCM_LOWTOHIGH; /* Time Period config */ + else + l |= OMAP_TIMER_CTRL_TCM_BOTHEDGES; /* Duty Cycle config */ + + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); + + pm_runtime_put_sync(dev); + + return 0; +} + static int omap_dm_timer_set_pwm(struct omap_dm_timer *cookie, int def_on, int toggle, int trigger, int autoreload) { @@ -1023,23 +1066,92 @@ static unsigned int omap_dm_timer_read_counter(struct omap_dm_timer *cookie) return __omap_dm_timer_read_counter(timer); } +static inline unsigned int __omap_dm_timer_cap(struct dmtimer *timer, int idx) +{ + return idx == 0 ? dmtimer_read(timer, OMAP_TIMER_CAPTURE_REG) : + dmtimer_read(timer, OMAP_TIMER_CAPTURE2_REG); +} + static int omap_dm_timer_write_counter(struct omap_dm_timer *cookie, unsigned int value) { struct dmtimer *timer; + struct device *dev; timer = to_dmtimer(cookie); - if (unlikely(!timer || !atomic_read(&timer->enabled))) { - pr_err("%s: timer not available or enabled.\n", __func__); + if (unlikely(!timer)) { + pr_err("%s: timer not available.\n", __func__); return -EINVAL; } + dev = &timer->pdev->dev; + + pm_runtime_resume_and_get(dev); dmtimer_write(timer, OMAP_TIMER_COUNTER_REG, value); + pm_runtime_put_sync(dev); /* Save the context */ timer->context.tcrr = value; return 0; } +/** + * omap_dm_timer_cap_counter() - Calculate the high count or period count depending on the + * configuration. + * @cookie:Pointer to OMAP DM timer + * @is_period:Whether to configure timer in period or duty cycle mode + * + * Return high count or period count if timer is enabled else appropriate error. + */ +static unsigned int omap_dm_timer_cap_counter(struct omap_dm_timer *cookie, bool is_period) +{ + struct dmtimer *timer; + unsigned int cap1 = 0; + unsigned int cap2 = 0; + u32 l, ret; + + timer = to_dmtimer(cookie); + if (unlikely(!timer || !atomic_read(&timer->enabled))) { + pr_err("%s:timer is not available or enabled.%p\n", __func__, (void *)timer); + return -EINVAL; + } + + /* Stop the timer */ + omap_dm_timer_stop(cookie); + + /* Clear the timer counter value to 0 */ + ret = omap_dm_timer_write_counter(cookie, 0); + if (ret) + return ret; + + /* Sets the timer capture configuration for period/duty cycle calculation */ + ret = omap_dm_timer_set_cap(cookie, true, is_period); + if (ret) { + pr_err("%s: Failed to set timer capture configuration.\n", __func__); + return ret; + } + /* Start the timer */ + omap_dm_timer_start(cookie); + + /* + * 1 sec delay is given so as to provide + * enough time to capture low frequency signals. + */ + msleep(1000); + + cap1 = __omap_dm_timer_cap(timer, 0); + cap2 = __omap_dm_timer_cap(timer, 1); + + /* + * Clears the TCLR configuration. + * The start bit must be set to 1 as the timer is already in start mode. 
+ */ + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); + l &= ~(0xffff) | 0x1; + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); + + return (cap2-cap1); +} + static int __maybe_unused omap_dm_timer_runtime_suspend(struct device *dev) { struct dmtimer *timer = dev_get_drvdata(dev); @@ -1246,6 +1358,9 @@ static const struct omap_dm_timer_ops dmtimer_ops = { .write_counter = omap_dm_timer_write_counter, .read_status = omap_dm_timer_read_status, .write_status = omap_dm_timer_write_status, + .set_cap = omap_dm_timer_set_cap, + .get_cap_status = omap_dm_timer_get_pwm_status, + .read_cap = omap_dm_timer_cap_counter, }; static const struct dmtimer_platform_data omap3plus_pdata = { diff --git a/include/linux/platform_data/dmtimer-omap.h b/include/linux/platform_data/dmtimer-omap.h index 95d852aef130..726d89143842 100644 --- a/include/linux/platform_data/dmtimer-omap.h +++ b/include/linux/platform_data/dmtimer-omap.h @@ -36,9 +36,13 @@ struct omap_dm_timer_ops { int (*set_pwm)(struct omap_dm_timer *timer, int def_on, int toggle, int trigger, int autoreload); int (*get_pwm_status)(struct omap_dm_timer *timer); + int (*set_cap)(struct omap_dm_timer *timer, + int autoreload, bool config_period); + int (*get_cap_status)(struct omap_dm_timer *timer); int (*set_prescaler)(struct omap_dm_timer *timer, int prescaler); unsigned int (*read_counter)(struct omap_dm_timer *timer); + unsigned int (*read_cap)(struct omap_dm_timer *timer, bool is_period); int (*write_counter)(struct omap_dm_timer *timer, unsigned int value); unsigned int (*read_status)(struct omap_dm_timer *timer); -- cgit v1.2.3 From 17eb98d6b517b6e7faaebed496fd688dbb1771d9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 22 Sep 2025 14:29:48 +1000 Subject: VFS/ovl: add lookup_one_positive_killable() ovl wants a lookup which won't block on a fatal signal. It currently uses down_write_killable() and then repeatedly calls lookup_one(). The lock may not be needed if the name is already in the dcache, and it aids proposed future changes if the locking is kept internal to namei.c. So this patch adds lookup_one_positive_killable(), which is like lookup_one_positive() but will abort in the face of a fatal signal. overlayfs is changed to use this. Note that instead of always getting an exclusive lock, ovl now only gets a shared lock, and only sometimes. The exclusive lock was never needed. However down_read_killable() was only added in v4.15 but overlayfs started using down_write_killable() here in v4.7. Note that the linked list ->first_maybe_whiteout ->next_maybe_whiteout is local to the thread, so there is no concurrency in that list which could be threatened by removing the locking.
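As a rough usage sketch (the helper name and error handling below are illustrative, not part of this patch):

	#include <linux/namei.h>
	#include <linux/dcache.h>

	/* Check whether @name exists under @dir; -EINTR on a fatal signal. */
	static int example_name_exists(struct mnt_idmap *idmap, struct dentry *dir,
				       const char *name, int len)
	{
		struct dentry *d;

		d = lookup_one_positive_killable(idmap, &QSTR_LEN(name, len), dir);
		if (IS_ERR(d))
			return PTR_ERR(d);	/* -ENOENT for negatives, -EINTR, ... */
		dput(d);
		return 0;
	}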
Reviewed-by: Amir Goldstein Signed-off-by: NeilBrown Signed-off-by: Christian Brauner --- fs/namei.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/overlayfs/readdir.c | 28 ++++++++++++------------- include/linux/namei.h | 3 +++ 3 files changed, 72 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index cd43ff89fbaa..c7c6b255db2c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1827,6 +1827,20 @@ static struct dentry *lookup_slow(const struct qstr *name, return res; } +static struct dentry *lookup_slow_killable(const struct qstr *name, + struct dentry *dir, + unsigned int flags) +{ + struct inode *inode = dir->d_inode; + struct dentry *res; + + if (inode_lock_shared_killable(inode)) + return ERR_PTR(-EINTR); + res = __lookup_slow(name, dir, flags); + inode_unlock_shared(inode); + return res; +} + static inline int may_lookup(struct mnt_idmap *idmap, struct nameidata *restrict nd) { @@ -3010,6 +3024,47 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name, } EXPORT_SYMBOL(lookup_one_unlocked); +/** + * lookup_one_positive_killable - lookup single pathname component + * @idmap: idmap of the mount the lookup is performed from + * @name: qstr holding pathname component to lookup + * @base: base directory to lookup from + * + * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns + * known positive or ERR_PTR(). This is what most of the users want. + * + * Note that pinned negative with unlocked parent _can_ become positive at any + * time, so callers of lookup_one_unlocked() need to be very careful; pinned + * positives have ->d_inode stable, so this one avoids such problems. + * + * This can be used for in-kernel filesystem clients such as file servers. + * + * It should be called without the parent i_rwsem held, and will take + * the i_rwsem itself if necessary. If a fatal signal is pending or + * delivered, it will return %-EINTR if the lock is needed.
+ */ +struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, + struct qstr *name, + struct dentry *base) +{ + int err; + struct dentry *ret; + + err = lookup_one_common(idmap, name, base); + if (err) + return ERR_PTR(err); + + ret = lookup_dcache(name, base, 0); + if (!ret) + ret = lookup_slow_killable(name, base, 0); + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + dput(ret); + ret = ERR_PTR(-ENOENT); + } + return ret; +} +EXPORT_SYMBOL(lookup_one_positive_killable); + /** * lookup_one_positive_unlocked - lookup single pathname component * @idmap: idmap of the mount the lookup is performed from diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index b65cdfce31ce..15cb06fa0c9a 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -270,26 +270,26 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name, static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) { - int err; + int err = 0; struct dentry *dentry, *dir = path->dentry; const struct cred *old_cred; old_cred = ovl_override_creds(rdd->dentry->d_sb); - err = down_write_killable(&dir->d_inode->i_rwsem); - if (!err) { - while (rdd->first_maybe_whiteout) { - struct ovl_cache_entry *p = - rdd->first_maybe_whiteout; - rdd->first_maybe_whiteout = p->next_maybe_whiteout; - dentry = lookup_one(mnt_idmap(path->mnt), - &QSTR_LEN(p->name, p->len), dir); - if (!IS_ERR(dentry)) { - p->is_whiteout = ovl_is_whiteout(dentry); - dput(dentry); - } + while (rdd->first_maybe_whiteout) { + struct ovl_cache_entry *p = + rdd->first_maybe_whiteout; + rdd->first_maybe_whiteout = p->next_maybe_whiteout; + dentry = lookup_one_positive_killable(mnt_idmap(path->mnt), + &QSTR_LEN(p->name, p->len), + dir); + if (!IS_ERR(dentry)) { + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } else if (PTR_ERR(dentry) == -EINTR) { + err = -EINTR; + break; } - inode_unlock(dir->d_inode); } ovl_revert_creds(old_cred); diff --git a/include/linux/namei.h b/include/linux/namei.h index 5d085428e471..551a1a01e5e7 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -80,6 +80,9 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base); +struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, + struct qstr *name, + struct dentry *base); extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); -- cgit v1.2.3 From d7fb2c410240348edee7867c29b60688175dcc11 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 22 Sep 2025 14:29:50 +1000 Subject: VFS: unify old_mnt_idmap and new_mnt_idmap in renamedata A rename operation can only rename within a single mount. Callers of vfs_rename() must and do ensure this is the case. So there is no point in having two mnt_idmaps in renamedata as they are always the same. Only one of them is passed to ->rename in any case. This patch replaces both with a single "mnt_idmap" and changes all callers. 
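A minimal sketch of a post-change caller (names assumed, locking and error handling elided):

	struct renamedata rd = {
		.mnt_idmap  = &nop_mnt_idmap,	/* one idmap; both ends share a mount */
		.old_parent = old_dir,
		.old_dentry = old_dentry,
		.new_parent = new_dir,
		.new_dentry = new_dentry,
	};

	err = vfs_rename(&rd);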
Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Signed-off-by: Christian Brauner --- fs/cachefiles/namei.c | 3 +-- fs/ecryptfs/inode.c | 3 +-- fs/namei.c | 17 ++++++++--------- fs/nfsd/vfs.c | 3 +-- fs/overlayfs/overlayfs.h | 3 +-- fs/smb/server/vfs.c | 3 +-- include/linux/fs.h | 6 ++---- 7 files changed, 15 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 91dfd0231877..d1edb2ac3837 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -387,10 +387,9 @@ try_again: cachefiles_io_error(cache, "Rename security error %d", ret); } else { struct renamedata rd = { - .old_mnt_idmap = &nop_mnt_idmap, + .mnt_idmap = &nop_mnt_idmap, .old_parent = dir, .old_dentry = rep, - .new_mnt_idmap = &nop_mnt_idmap, .new_parent = cache->graveyard, .new_dentry = grave, }; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 72fbe1316ab8..abd954c6a14e 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -634,10 +634,9 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto out_lock; } - rd.old_mnt_idmap = &nop_mnt_idmap; + rd.mnt_idmap = &nop_mnt_idmap; rd.old_parent = lower_old_dir_dentry; rd.old_dentry = lower_old_dentry; - rd.new_mnt_idmap = &nop_mnt_idmap; rd.new_parent = lower_new_dir_dentry; rd.new_dentry = lower_new_dentry; rc = vfs_rename(&rd); diff --git a/fs/namei.c b/fs/namei.c index e2c2ab286bc0..180037b96956 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5077,20 +5077,20 @@ int vfs_rename(struct renamedata *rd) if (source == target) return 0; - error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir); + error = may_delete(rd->mnt_idmap, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { - error = may_create(rd->new_mnt_idmap, new_dir, new_dentry); + error = may_create(rd->mnt_idmap, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) - error = may_delete(rd->new_mnt_idmap, new_dir, + error = may_delete(rd->mnt_idmap, new_dir, new_dentry, is_dir); else - error = may_delete(rd->new_mnt_idmap, new_dir, + error = may_delete(rd->mnt_idmap, new_dir, new_dentry, new_is_dir); } if (error) @@ -5105,13 +5105,13 @@ int vfs_rename(struct renamedata *rd) */ if (new_dir != old_dir) { if (is_dir) { - error = inode_permission(rd->old_mnt_idmap, source, + error = inode_permission(rd->mnt_idmap, source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { - error = inode_permission(rd->new_mnt_idmap, target, + error = inode_permission(rd->mnt_idmap, target, MAY_WRITE); if (error) return error; @@ -5179,7 +5179,7 @@ int vfs_rename(struct renamedata *rd) if (error) goto out; } - error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry, + error = old_dir->i_op->rename(rd->mnt_idmap, old_dir, old_dentry, new_dir, new_dentry, flags); if (error) goto out; @@ -5322,10 +5322,9 @@ retry_deleg: rd.old_parent = old_path.dentry; rd.old_dentry = old_dentry; - rd.old_mnt_idmap = mnt_idmap(old_path.mnt); + rd.mnt_idmap = mnt_idmap(old_path.mnt); rd.new_parent = new_path.dentry; rd.new_dentry = new_dentry; - rd.new_mnt_idmap = mnt_idmap(new_path.mnt); rd.delegated_inode = &delegated_inode; rd.flags = flags; error = vfs_rename(&rd); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 98ab55ba3ced..5f3e99f956ca 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1943,10 +1943,9 @@ retry: goto out_dput_old; } else { struct renamedata rd = { - .old_mnt_idmap = &nop_mnt_idmap, + .mnt_idmap = &nop_mnt_idmap, 
.old_parent = fdentry, .old_dentry = odentry, - .new_mnt_idmap = &nop_mnt_idmap, .new_parent = tdentry, .new_dentry = ndentry, }; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index bb0d7ded8e76..4f84abaa0d68 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -361,10 +361,9 @@ static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir, { int err; struct renamedata rd = { - .old_mnt_idmap = ovl_upper_mnt_idmap(ofs), + .mnt_idmap = ovl_upper_mnt_idmap(ofs), .old_parent = olddir, .old_dentry = olddentry, - .new_mnt_idmap = ovl_upper_mnt_idmap(ofs), .new_parent = newdir, .new_dentry = newdentry, .flags = flags, diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 04539037108c..07739055ac9f 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -770,10 +770,9 @@ retry: goto out4; } - rd.old_mnt_idmap = mnt_idmap(old_path->mnt), + rd.mnt_idmap = mnt_idmap(old_path->mnt), rd.old_parent = old_parent, rd.old_dentry = old_child, - rd.new_mnt_idmap = mnt_idmap(new_path.mnt), rd.new_parent = new_path.dentry, rd.new_dentry = new_dentry, rd.flags = flags, diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..73b39e5bb9e4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2008,20 +2008,18 @@ int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, /** * struct renamedata - contains all information required for renaming - * @old_mnt_idmap: idmap of the old mount the inode was found from + * @mnt_idmap: idmap of the mount in which the rename is happening. * @old_parent: parent of source * @old_dentry: source - * @new_mnt_idmap: idmap of the new mount the inode was found from * @new_parent: parent of destination * @new_dentry: destination * @delegated_inode: returns an inode needing a delegation break * @flags: rename flags */ struct renamedata { - struct mnt_idmap *old_mnt_idmap; + struct mnt_idmap *mnt_idmap; struct dentry *old_parent; struct dentry *old_dentry; - struct mnt_idmap *new_mnt_idmap; struct dentry *new_parent; struct dentry *new_dentry; struct inode **delegated_inode; -- cgit v1.2.3 From 76a53de6f7ff0641570364234fb4489f4d4fc8e9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 22 Sep 2025 14:29:51 +1000 Subject: VFS/audit: introduce kern_path_parent() for audit audit_alloc_mark() and audit_get_nd() both need to perform a path lookup getting the parent dentry (which must exist) and the final target (following a LAST_NORM name) which sometimes doesn't need to exist. They don't need the parent to be locked, but use kern_path_locked() or kern_path_locked_negative() anyway. This is somewhat misleading to the casual reader. This patch introduces a more targeted function, kern_path_parent(), which returns not holding locks. On success the "path" will be set to the parent, which must be found, and the return value is the dentry of the target, which might be negative. This will clear the way to rename kern_path_locked() which is otherwise only used to prepare for removing something. It also allows us to remove kern_path_locked_negative(), which is transformed into the new kern_path_parent(). 
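A sketch of the intended calling pattern (illustrative only; it mirrors the audit conversion below):

	struct path parent;
	struct dentry *d;
	unsigned long ino = 0;

	d = kern_path_parent(pathname, &parent);	/* no locks held on return */
	if (IS_ERR(d))
		return PTR_ERR(d);
	if (!d_really_is_negative(d))
		ino = d_backing_inode(d)->i_ino;	/* target exists */
	dput(d);
	path_put(&parent);	/* caller drops the parent reference */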
Signed-off-by: NeilBrown Signed-off-by: Christian Brauner --- fs/namei.c | 23 +++++++++++++++++------ include/linux/namei.h | 2 +- kernel/audit_fsnotify.c | 11 ++++++----- kernel/audit_watch.c | 3 +-- 4 files changed, 25 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 180037b96956..4017bc8641d3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2781,7 +2781,20 @@ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct return d; } -struct dentry *kern_path_locked_negative(const char *name, struct path *path) +/** + * kern_path_parent - lookup path returning parent and target + * @name: path name + * @path: path to store parent in + * + * The path @name should end with a normal component, not "." or ".." or "/". + * A lookup is performed and if successful the parent information + * is stored in @path and the dentry is returned. + * + * The dentry may be negative; the parent will be positive. + * + * Returns: dentry or error. + */ +struct dentry *kern_path_parent(const char *name, struct path *path) { struct path parent_path __free(path_put) = {}; struct filename *filename __free(putname) = getname_kernel(name); @@ -2794,12 +2807,10 @@ struct dentry *kern_path_locked_negative(const char *name, struct path *path) return ERR_PTR(error); if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); - inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); - d = lookup_one_qstr_excl(&last, parent_path.dentry, LOOKUP_CREATE); - if (IS_ERR(d)) { - inode_unlock(parent_path.dentry->d_inode); + + d = lookup_noperm_unlocked(&last, parent_path.dentry); + if (IS_ERR(d)) return d; - } path->dentry = no_free_ptr(parent_path.dentry); path->mnt = no_free_ptr(parent_path.mnt); return d; diff --git a/include/linux/namei.h b/include/linux/namei.h index 551a1a01e5e7..1d5038c21c20 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -57,12 +57,12 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, unsigned int flags); extern int kern_path(const char *, unsigned, struct path *); +struct dentry *kern_path_parent(const char *name, struct path *parent); extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int); extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); extern void done_path_create(struct path *, struct dentry *); extern struct dentry *kern_path_locked(const char *, struct path *); -extern struct dentry *kern_path_locked_negative(const char *, struct path *); extern struct dentry *user_path_locked_at(int , const char __user *, struct path *); int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index c565fbf66ac8..b92805b317a2 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -76,17 +76,18 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa struct audit_fsnotify_mark *audit_mark; struct path path; struct dentry *dentry; - struct inode *inode; int ret; if (pathname[0] != '/' || pathname[len-1] == '/') return ERR_PTR(-EINVAL); - dentry = kern_path_locked(pathname, &path); + dentry = kern_path_parent(pathname, &path); if (IS_ERR(dentry)) return ERR_CAST(dentry); /* returning an error */ - inode = path.dentry->d_inode; - inode_unlock(inode); + if (d_really_is_negative(dentry)) { + audit_mark = ERR_PTR(-ENOENT); + goto out; + } audit_mark
= kzalloc(sizeof(*audit_mark), GFP_KERNEL); if (unlikely(!audit_mark)) { @@ -100,7 +101,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa audit_update_mark(audit_mark, dentry->d_inode); audit_mark->rule = krule; - ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0); + ret = fsnotify_add_inode_mark(&audit_mark->mark, path.dentry->d_inode, 0); if (ret < 0) { audit_mark->path = NULL; fsnotify_put_mark(&audit_mark->mark); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0ebbbe37a60f..a700e3c8925f 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -349,7 +349,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) { struct dentry *d; - d = kern_path_locked_negative(watch->path, parent); + d = kern_path_parent(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); @@ -359,7 +359,6 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) watch->ino = d_backing_inode(d)->i_ino; } - inode_unlock(d_backing_inode(parent->dentry)); dput(d); return 0; } -- cgit v1.2.3 From 3d18f80ce181ba27f37d0ec1c550b22acb01dd49 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 22 Sep 2025 14:29:52 +1000 Subject: VFS: rename kern_path_locked() and related functions. kern_path_locked() is now only used to prepare for removing an object from the filesystem (and that is the only credible reason for wanting a positive locked dentry). Thus it corresponds to kern_path_create() and so should have a corresponding name. Unfortunately the name "kern_path_create" is somewhat misleading as it doesn't actually create anything. The recently added simple_start_creating() provides a better pattern I believe. The "start" can be matched with "end" to bracket the creating or removing. So this patch changes names: kern_path_locked -> start_removing_path kern_path_create -> start_creating_path user_path_create -> start_creating_user_path user_path_locked_at -> start_removing_user_path_at done_path_create -> end_creating_path and also introduces end_removing_path() which is identical to end_creating_path(). __start_removing_path (which was __kern_path_locked) is enhanced to call mnt_want_write() for consistency with the start_creating_path(). Reviewed-by: Amir Goldstein Signed-off-by: NeilBrown Signed-off-by: Christian Brauner --- Documentation/filesystems/porting.rst | 12 ++++++ arch/powerpc/platforms/cell/spufs/syscalls.c | 4 +- drivers/base/devtmpfs.c | 22 +++++------ fs/bcachefs/fs-ioctl.c | 10 ++--- fs/init.c | 17 ++++---- fs/namei.c | 59 +++++++++++++++++----------- fs/ocfs2/refcounttree.c | 4 +- fs/smb/server/vfs.c | 8 ++-- include/linux/namei.h | 14 ++++--- kernel/bpf/inode.c | 4 +- net/unix/af_unix.c | 6 +-- 11 files changed, 93 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 85f590254f07..e0494860be6b 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1285,3 +1285,15 @@ rather than a VMA, as the VMA at this stage is not yet valid. The vm_area_desc provides the minimum required information for a filesystem to initialise state upon memory mapping of a file-backed region, and output parameters for the file system to set this state. 
+ +--- + +**mandatory** + +Several functions are renamed: + +- kern_path_locked -> start_removing_path +- kern_path_create -> start_creating_path +- user_path_create -> start_creating_user_path +- user_path_locked_at -> start_removing_user_path_at +- done_path_create -> end_creating_path diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c index 157e046e6e93..ea4ba1b6ce6a 100644 --- a/arch/powerpc/platforms/cell/spufs/syscalls.c +++ b/arch/powerpc/platforms/cell/spufs/syscalls.c @@ -67,11 +67,11 @@ static long do_spu_create(const char __user *pathname, unsigned int flags, struct dentry *dentry; int ret; - dentry = user_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); + dentry = start_creating_user_path(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); ret = PTR_ERR(dentry); if (!IS_ERR(dentry)) { ret = spufs_create(&path, dentry, flags, mode, neighbor); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); } return ret; diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 31bfb3194b4c..9d4e46ad8352 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -176,7 +176,7 @@ static int dev_mkdir(const char *name, umode_t mode) struct dentry *dentry; struct path path; - dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); + dentry = start_creating_path(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -184,7 +184,7 @@ static int dev_mkdir(const char *name, umode_t mode) if (!IS_ERR(dentry)) /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return PTR_ERR_OR_ZERO(dentry); } @@ -222,10 +222,10 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, struct path path; int err; - dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); + dentry = start_creating_path(AT_FDCWD, nodename, &path, 0); if (dentry == ERR_PTR(-ENOENT)) { create_path(nodename); - dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); + dentry = start_creating_path(AT_FDCWD, nodename, &path, 0); } if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -246,7 +246,7 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return err; } @@ -256,7 +256,7 @@ static int dev_rmdir(const char *name) struct dentry *dentry; int err; - dentry = kern_path_locked(name, &parent); + dentry = start_removing_path(name, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (d_inode(dentry)->i_private == &thread) @@ -265,9 +265,7 @@ static int dev_rmdir(const char *name) else err = -EPERM; - dput(dentry); - inode_unlock(d_inode(parent.dentry)); - path_put(&parent); + end_removing_path(&parent, dentry); return err; } @@ -325,7 +323,7 @@ static int handle_remove(const char *nodename, struct device *dev) int deleted = 0; int err = 0; - dentry = kern_path_locked(nodename, &parent); + dentry = start_removing_path(nodename, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -349,10 +347,8 @@ static int handle_remove(const char *nodename, struct device *dev) if (!err || err == -ENOENT) deleted = 1; } - dput(dentry); - inode_unlock(d_inode(parent.dentry)); + end_removing_path(&parent, dentry); - path_put(&parent); if (deleted && strchr(nodename, '/')) delete_path(nodename); return err; diff --git a/fs/bcachefs/fs-ioctl.c 
b/fs/bcachefs/fs-ioctl.c index 4e72e654da96..43510da5e734 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -255,7 +255,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); } - dst_dentry = user_path_create(arg.dirfd, + dst_dentry = start_creating_user_path(arg.dirfd, (const char __user *)(unsigned long)arg.dst_ptr, &dst_path, lookup_flags); error = PTR_ERR_OR_ZERO(dst_dentry); @@ -314,7 +314,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, d_instantiate(dst_dentry, &inode->v); fsnotify_mkdir(dir, dst_dentry); err3: - done_path_create(&dst_path, dst_dentry); + end_creating_path(&dst_path, dst_dentry); err2: if (arg.src_ptr) path_put(&src_path); @@ -334,7 +334,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, if (arg.flags) return -EINVAL; - victim = user_path_locked_at(arg.dirfd, name, &path); + victim = start_removing_user_path_at(arg.dirfd, name, &path); if (IS_ERR(victim)) return PTR_ERR(victim); @@ -351,9 +351,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, d_invalidate(victim); } err: - inode_unlock(dir); - dput(victim); - path_put(&path); + end_removing_path(&path, victim); return ret; } diff --git a/fs/init.c b/fs/init.c index eef5124885e3..07f592ccdba8 100644 --- a/fs/init.c +++ b/fs/init.c @@ -149,7 +149,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) else if (!(S_ISBLK(mode) || S_ISCHR(mode))) return -EINVAL; - dentry = kern_path_create(AT_FDCWD, filename, &path, 0); + dentry = start_creating_path(AT_FDCWD, filename, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -158,7 +158,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) if (!error) error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode, new_decode_dev(dev)); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } @@ -173,7 +173,7 @@ int __init init_link(const char *oldname, const char *newname) if (error) return error; - new_dentry = kern_path_create(AT_FDCWD, newname, &new_path, 0); + new_dentry = start_creating_path(AT_FDCWD, newname, &new_path, 0); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out; @@ -191,7 +191,7 @@ int __init init_link(const char *oldname, const char *newname) error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, NULL); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); out: path_put(&old_path); return error; @@ -203,14 +203,14 @@ int __init init_symlink(const char *oldname, const char *newname) struct path path; int error; - dentry = kern_path_create(AT_FDCWD, newname, &path, 0); + dentry = start_creating_path(AT_FDCWD, newname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); error = security_path_symlink(&path, dentry, oldname); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, oldname); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } @@ -225,7 +225,8 @@ int __init init_mkdir(const char *pathname, umode_t mode) struct path path; int error; - dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); + dentry = start_creating_path(AT_FDCWD, pathname, &path, + LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); mode = mode_strip_umask(d_inode(path.dentry), mode); @@ -236,7 +237,7 @@ int 
__init init_mkdir(const char *pathname, umode_t mode) if (IS_ERR(dentry)) error = PTR_ERR(dentry); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return error; } diff --git a/fs/namei.c b/fs/namei.c index 4017bc8641d3..92973a7a8091 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2758,7 +2758,8 @@ static int filename_parentat(int dfd, struct filename *name, } /* does lookup, returns the object with parent locked */ -static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path) +static struct dentry *__start_removing_path(int dfd, struct filename *name, + struct path *path) { struct path parent_path __free(path_put) = {}; struct dentry *d; @@ -2770,15 +2771,26 @@ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct return ERR_PTR(error); if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); + /* don't fail immediately if it's r/o, at least try to report other errors */ + error = mnt_want_write(parent_path.mnt); inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); - if (IS_ERR(d)) { - inode_unlock(parent_path.dentry->d_inode); - return d; - } + if (IS_ERR(d)) + goto unlock; + if (error) + goto fail; path->dentry = no_free_ptr(parent_path.dentry); path->mnt = no_free_ptr(parent_path.mnt); return d; + +fail: + dput(d); + d = ERR_PTR(error); +unlock: + inode_unlock(parent_path.dentry->d_inode); + if (!error) + mnt_drop_write(parent_path.mnt); + return d; } /** @@ -2816,24 +2828,26 @@ struct dentry *kern_path_parent(const char *name, struct path *path) return d; } -struct dentry *kern_path_locked(const char *name, struct path *path) +struct dentry *start_removing_path(const char *name, struct path *path) { struct filename *filename = getname_kernel(name); - struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path); + struct dentry *res = __start_removing_path(AT_FDCWD, filename, path); putname(filename); return res; } -struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path) +struct dentry *start_removing_user_path_at(int dfd, + const char __user *name, + struct path *path) { struct filename *filename = getname(name); - struct dentry *res = __kern_path_locked(dfd, filename, path); + struct dentry *res = __start_removing_path(dfd, filename, path); putname(filename); return res; } -EXPORT_SYMBOL(user_path_locked_at); +EXPORT_SYMBOL(start_removing_user_path_at); int kern_path(const char *name, unsigned int flags, struct path *path) { @@ -4223,8 +4237,8 @@ out: return dentry; } -struct dentry *kern_path_create(int dfd, const char *pathname, - struct path *path, unsigned int lookup_flags) +struct dentry *start_creating_path(int dfd, const char *pathname, + struct path *path, unsigned int lookup_flags) { struct filename *filename = getname_kernel(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); @@ -4232,9 +4246,9 @@ struct dentry *kern_path_create(int dfd, const char *pathname, putname(filename); return res; } -EXPORT_SYMBOL(kern_path_create); +EXPORT_SYMBOL(start_creating_path); -void done_path_create(struct path *path, struct dentry *dentry) +void end_creating_path(struct path *path, struct dentry *dentry) { if (!IS_ERR(dentry)) dput(dentry); @@ -4242,10 +4256,11 @@ void done_path_create(struct path *path, struct dentry *dentry) mnt_drop_write(path->mnt); path_put(path); } -EXPORT_SYMBOL(done_path_create); +EXPORT_SYMBOL(end_creating_path); -inline struct dentry 
*user_path_create(int dfd, const char __user *pathname, - struct path *path, unsigned int lookup_flags) +inline struct dentry *start_creating_user_path( + int dfd, const char __user *pathname, + struct path *path, unsigned int lookup_flags) { struct filename *filename = getname(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); @@ -4253,7 +4268,7 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname, putname(filename); return res; } -EXPORT_SYMBOL(user_path_create); +EXPORT_SYMBOL(start_creating_user_path); /** * vfs_mknod - create device node or file @@ -4361,7 +4376,7 @@ retry: break; } out2: - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4465,7 +4480,7 @@ retry: if (IS_ERR(dentry)) error = PTR_ERR(dentry); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4819,7 +4834,7 @@ retry: if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, from->name); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; @@ -4988,7 +5003,7 @@ retry: error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 8f732742b26e..267b50e8e42e 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4418,7 +4418,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, return error; } - new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0); + new_dentry = start_creating_user_path(AT_FDCWD, newname, &new_path, 0); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) { mlog_errno(error); @@ -4435,7 +4435,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, d_inode(new_path.dentry), new_dentry, preserve); out_dput: - done_path_create(&new_path, new_dentry); + end_creating_path(&new_path, new_dentry); out: path_put(&old_path); diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 07739055ac9f..1cfa688904b2 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -196,7 +196,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) pr_err("File(%s): creation failed (err:%d)\n", name, err); } - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return err; } @@ -237,7 +237,7 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) if (!err && dentry != d) ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry)); - done_path_create(&path, dentry); + end_creating_path(&path, dentry); if (err) pr_err("mkdir(%s): creation failed (err:%d)\n", name, err); return err; @@ -669,7 +669,7 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, ksmbd_debug(VFS, "vfs_link failed err %d\n", err); out3: - done_path_create(&newpath, dentry); + end_creating_path(&newpath, dentry); out2: path_put(&oldpath); out1: @@ -1325,7 +1325,7 @@ struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, if (!abs_name) return ERR_PTR(-ENOMEM); - dent = kern_path_create(AT_FDCWD, abs_name, path, flags); + dent = start_creating_path(AT_FDCWD, abs_name, 
path, flags); kfree(abs_name); return dent; } diff --git a/include/linux/namei.h b/include/linux/namei.h index 1d5038c21c20..a7800ef04e76 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -59,11 +59,15 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, extern int kern_path(const char *, unsigned, struct path *); struct dentry *kern_path_parent(const char *name, struct path *parent); -extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int); -extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); -extern void done_path_create(struct path *, struct dentry *); -extern struct dentry *kern_path_locked(const char *, struct path *); -extern struct dentry *user_path_locked_at(int , const char __user *, struct path *); +extern struct dentry *start_creating_path(int, const char *, struct path *, unsigned int); +extern struct dentry *start_creating_user_path(int, const char __user *, struct path *, unsigned int); +extern void end_creating_path(struct path *, struct dentry *); +extern struct dentry *start_removing_path(const char *, struct path *); +extern struct dentry *start_removing_user_path_at(int , const char __user *, struct path *); +static inline void end_removing_path(struct path *path , struct dentry *dentry) +{ + end_creating_path(path, dentry); +} int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5c2e96b19392..fadf3817a9c5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -442,7 +442,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, umode_t mode; int ret; - dentry = user_path_create(path_fd, pathname, &path, 0); + dentry = start_creating_user_path(path_fd, pathname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -471,7 +471,7 @@ static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, ret = -EPERM; } out: - done_path_create(&path, dentry); + end_creating_path(&path, dentry); return ret; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6d7c110814ff..768098dec231 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1387,7 +1387,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, * Get the parent directory, calculate the hash for last * component. */ - dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); + dentry = start_creating_path(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out; @@ -1417,7 +1417,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, unix_table_double_unlock(net, old_hash, new_hash); unix_insert_bsd_socket(sk); mutex_unlock(&u->bindlock); - done_path_create(&parent, dentry); + end_creating_path(&parent, dentry); return 0; out_unlock: @@ -1427,7 +1427,7 @@ out_unlink: /* failed after successful mknod? unlink what we'd created... */ vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); out_path: - done_path_create(&parent, dentry); + end_creating_path(&parent, dentry); out: unix_release_addr(addr); return err == -EEXIST ? 
-EADDRINUSE : err; } -- cgit v1.2.3 From 1abc1b212effe920f4729353880c8e03f1d76b4b Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 31 Jul 2025 13:23:40 +0100 Subject: coresight: Appropriately disable programming clocks Some CoreSight components have programming clocks (pclk) which are enabled using clk_get() and clk_prepare_enable(). However, in many cases, these clocks are not disabled when modules exit but only released by clk_put(). To fix the issue, this commit refactors the programming clock handling by replacing clk_get() and clk_prepare_enable() with devm_clk_get_optional_enabled() for enabling the APB clock. If the "apb_pclk" clock is not found, a NULL pointer is returned, and the function proceeds to attempt enabling the "apb" clock. Since ACPI platforms rely on firmware to manage clocks, returning a NULL pointer in this case leaves clock management to the firmware rather than the driver. This effectively avoids a clock imbalance issue during module removal, where the clock could be disabled twice: once during the ACPI runtime suspend and again during the devm resource release. Callers are updated to reuse the returned error value. With the change, programming clocks are managed as resources in the driver model layer, allowing clock cleanup to be handled automatically. As a result, manual cleanup operations are no longer needed and are removed from the Coresight drivers. Fixes: 73d779a03a76 ("coresight: etm4x: Change etm4_platform_driver driver for MMIO devices") Reviewed-by: Yeoreum Yun Tested-by: James Clark Signed-off-by: Leo Yan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250731-arm_cs_fix_clock_v4-v6-4-1dfe10bb3f6f@arm.com --- drivers/hwtracing/coresight/coresight-catu.c | 9 ++------- drivers/hwtracing/coresight/coresight-cpu-debug.c | 6 +----- drivers/hwtracing/coresight/coresight-ctcu-core.c | 10 ++-------- drivers/hwtracing/coresight/coresight-etm4x-core.c | 9 ++------- drivers/hwtracing/coresight/coresight-funnel.c | 6 +----- drivers/hwtracing/coresight/coresight-replicator.c | 6 +----- drivers/hwtracing/coresight/coresight-stm.c | 4 +--- drivers/hwtracing/coresight/coresight-tmc-core.c | 4 +--- drivers/hwtracing/coresight/coresight-tpiu.c | 4 +--- include/linux/coresight.h | 22 +++++++++------------- 10 files changed, 21 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c index af2a55f0c907..4c345ff2cff1 100644 --- a/drivers/hwtracing/coresight/coresight-catu.c +++ b/drivers/hwtracing/coresight/coresight-catu.c @@ -636,7 +636,7 @@ static int catu_platform_probe(struct platform_device *pdev) drvdata->pclk = coresight_get_enable_apb_pclk(&pdev->dev); if (IS_ERR(drvdata->pclk)) - return -ENODEV; + return PTR_ERR(drvdata->pclk); pm_runtime_get_noresume(&pdev->dev); pm_runtime_set_active(&pdev->dev); @@ -645,11 +645,8 @@ static int catu_platform_probe(struct platform_device *pdev) dev_set_drvdata(&pdev->dev, drvdata); ret = __catu_probe(&pdev->dev, res); pm_runtime_put(&pdev->dev); - if (ret) { + if (ret) pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); - } return ret; } @@ -663,8 +660,6 @@ static void catu_platform_remove(struct platform_device *pdev) __catu_remove(&pdev->dev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } #ifdef CONFIG_PM diff --git a/drivers/hwtracing/coresight/coresight-cpu-debug.c b/drivers/hwtracing/coresight/coresight-cpu-debug.c index
a871d997330b..e39dfb886688 100644 --- a/drivers/hwtracing/coresight/coresight-cpu-debug.c +++ b/drivers/hwtracing/coresight/coresight-cpu-debug.c @@ -699,7 +699,7 @@ static int debug_platform_probe(struct platform_device *pdev) drvdata->pclk = coresight_get_enable_apb_pclk(&pdev->dev); if (IS_ERR(drvdata->pclk)) - return -ENODEV; + return PTR_ERR(drvdata->pclk); dev_set_drvdata(&pdev->dev, drvdata); pm_runtime_get_noresume(&pdev->dev); @@ -710,8 +710,6 @@ static int debug_platform_probe(struct platform_device *pdev) if (ret) { pm_runtime_put_noidle(&pdev->dev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } return ret; } @@ -725,8 +723,6 @@ static void debug_platform_remove(struct platform_device *pdev) __debug_remove(&pdev->dev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } #ifdef CONFIG_ACPI diff --git a/drivers/hwtracing/coresight/coresight-ctcu-core.c b/drivers/hwtracing/coresight/coresight-ctcu-core.c index c6bafc96db96..de279efe3405 100644 --- a/drivers/hwtracing/coresight/coresight-ctcu-core.c +++ b/drivers/hwtracing/coresight/coresight-ctcu-core.c @@ -209,7 +209,7 @@ static int ctcu_probe(struct platform_device *pdev) drvdata->apb_clk = coresight_get_enable_apb_pclk(dev); if (IS_ERR(drvdata->apb_clk)) - return -ENODEV; + return PTR_ERR(drvdata->apb_clk); cfgs = of_device_get_match_data(dev); if (cfgs) { @@ -233,12 +233,8 @@ static int ctcu_probe(struct platform_device *pdev) desc.access = CSDEV_ACCESS_IOMEM(base); drvdata->csdev = coresight_register(&desc); - if (IS_ERR(drvdata->csdev)) { - if (!IS_ERR_OR_NULL(drvdata->apb_clk)) - clk_put(drvdata->apb_clk); - + if (IS_ERR(drvdata->csdev)) return PTR_ERR(drvdata->csdev); - } return 0; } @@ -275,8 +271,6 @@ static void ctcu_platform_remove(struct platform_device *pdev) ctcu_remove(pdev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->apb_clk)) - clk_put(drvdata->apb_clk); } #ifdef CONFIG_PM diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c index 81f20a167e00..4b98a7bf4cb7 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x-core.c +++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c @@ -2309,14 +2309,12 @@ static int etm4_probe_platform_dev(struct platform_device *pdev) drvdata->pclk = coresight_get_enable_apb_pclk(&pdev->dev); if (IS_ERR(drvdata->pclk)) - return -ENODEV; + return PTR_ERR(drvdata->pclk); if (res) { drvdata->base = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(drvdata->base)) { - clk_put(drvdata->pclk); + if (IS_ERR(drvdata->base)) return PTR_ERR(drvdata->base); - } } dev_set_drvdata(&pdev->dev, drvdata); @@ -2423,9 +2421,6 @@ static void etm4_remove_platform_dev(struct platform_device *pdev) if (drvdata) etm4_remove_dev(drvdata); pm_runtime_disable(&pdev->dev); - - if (drvdata && !IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } static const struct amba_id etm4_ids[] = { diff --git a/drivers/hwtracing/coresight/coresight-funnel.c b/drivers/hwtracing/coresight/coresight-funnel.c index b1922dbe9292..36fc4e991458 100644 --- a/drivers/hwtracing/coresight/coresight-funnel.c +++ b/drivers/hwtracing/coresight/coresight-funnel.c @@ -240,7 +240,7 @@ static int funnel_probe(struct device *dev, struct resource *res) drvdata->pclk = coresight_get_enable_apb_pclk(dev); if (IS_ERR(drvdata->pclk)) - return -ENODEV; + return PTR_ERR(drvdata->pclk); /* * Map the device base for dynamic-funnel, which has been @@ -284,8 +284,6 
@@ static int funnel_probe(struct device *dev, struct resource *res) out_disable_clk: if (ret && !IS_ERR_OR_NULL(drvdata->atclk)) clk_disable_unprepare(drvdata->atclk); - if (ret && !IS_ERR_OR_NULL(drvdata->pclk)) - clk_disable_unprepare(drvdata->pclk); return ret; } @@ -355,8 +353,6 @@ static void funnel_platform_remove(struct platform_device *pdev) funnel_remove(&pdev->dev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } static const struct of_device_id funnel_match[] = { diff --git a/drivers/hwtracing/coresight/coresight-replicator.c b/drivers/hwtracing/coresight/coresight-replicator.c index 06efd2b01a0f..6dd24eb10a94 100644 --- a/drivers/hwtracing/coresight/coresight-replicator.c +++ b/drivers/hwtracing/coresight/coresight-replicator.c @@ -247,7 +247,7 @@ static int replicator_probe(struct device *dev, struct resource *res) drvdata->pclk = coresight_get_enable_apb_pclk(dev); if (IS_ERR(drvdata->pclk)) - return -ENODEV; + return PTR_ERR(drvdata->pclk); /* * Map the device base for dynamic-replicator, which has been @@ -296,8 +296,6 @@ static int replicator_probe(struct device *dev, struct resource *res) out_disable_clk: if (ret && !IS_ERR_OR_NULL(drvdata->atclk)) clk_disable_unprepare(drvdata->atclk); - if (ret && !IS_ERR_OR_NULL(drvdata->pclk)) - clk_disable_unprepare(drvdata->pclk); return ret; } @@ -335,8 +333,6 @@ static void replicator_platform_remove(struct platform_device *pdev) replicator_remove(&pdev->dev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } #ifdef CONFIG_PM diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c index 464b0c85c3f7..f2de16e4d3b4 100644 --- a/drivers/hwtracing/coresight/coresight-stm.c +++ b/drivers/hwtracing/coresight/coresight-stm.c @@ -851,7 +851,7 @@ static int __stm_probe(struct device *dev, struct resource *res) drvdata->pclk = coresight_get_enable_apb_pclk(dev); if (IS_ERR(drvdata->pclk)) - return -ENODEV; + return PTR_ERR(drvdata->pclk); dev_set_drvdata(dev, drvdata); base = devm_ioremap_resource(dev, res); @@ -1033,8 +1033,6 @@ static void stm_platform_remove(struct platform_device *pdev) __stm_remove(&pdev->dev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } #ifdef CONFIG_ACPI diff --git a/drivers/hwtracing/coresight/coresight-tmc-core.c b/drivers/hwtracing/coresight/coresight-tmc-core.c index 85ba037c27f7..519f22542807 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-core.c +++ b/drivers/hwtracing/coresight/coresight-tmc-core.c @@ -981,7 +981,7 @@ static int tmc_platform_probe(struct platform_device *pdev) drvdata->pclk = coresight_get_enable_apb_pclk(&pdev->dev); if (IS_ERR(drvdata->pclk)) - return -ENODEV; + return PTR_ERR(drvdata->pclk); dev_set_drvdata(&pdev->dev, drvdata); pm_runtime_get_noresume(&pdev->dev); @@ -1005,8 +1005,6 @@ static void tmc_platform_remove(struct platform_device *pdev) __tmc_remove(&pdev->dev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } #ifdef CONFIG_PM diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c index 3e0159288428..b2559c6fac6d 100644 --- a/drivers/hwtracing/coresight/coresight-tpiu.c +++ b/drivers/hwtracing/coresight/coresight-tpiu.c @@ -153,7 +153,7 @@ static int __tpiu_probe(struct device *dev, struct resource *res) drvdata->pclk = coresight_get_enable_apb_pclk(dev); if 
(IS_ERR(drvdata->pclk)) - return -ENODEV; + return PTR_ERR(drvdata->pclk); dev_set_drvdata(dev, drvdata); /* Validity for the resource is already checked by the AMBA core */ @@ -293,8 +293,6 @@ static void tpiu_platform_remove(struct platform_device *pdev) __tpiu_remove(&pdev->dev); pm_runtime_disable(&pdev->dev); - if (!IS_ERR_OR_NULL(drvdata->pclk)) - clk_put(drvdata->pclk); } #ifdef CONFIG_ACPI diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 4ac65c68bbf4..1e652e157841 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -6,6 +6,7 @@ #ifndef _LINUX_CORESIGHT_H #define _LINUX_CORESIGHT_H +#include #include #include #include @@ -480,26 +481,21 @@ static inline bool is_coresight_device(void __iomem *base) * Returns: * * clk - Clock is found and enabled - * NULL - clock is not found + * NULL - Clock is controlled by firmware (ACPI device only) * ERROR - Clock is found but failed to enable */ static inline struct clk *coresight_get_enable_apb_pclk(struct device *dev) { struct clk *pclk; - int ret; - pclk = clk_get(dev, "apb_pclk"); - if (IS_ERR(pclk)) { - pclk = clk_get(dev, "apb"); - if (IS_ERR(pclk)) - return NULL; - } + /* Firmware controls clocks for an ACPI device. */ + if (has_acpi_companion(dev)) + return NULL; + + pclk = devm_clk_get_optional_enabled(dev, "apb_pclk"); + if (!pclk) + pclk = devm_clk_get_optional_enabled(dev, "apb"); - ret = clk_prepare_enable(pclk); - if (ret) { - clk_put(pclk); - return ERR_PTR(ret); - } return pclk; } -- cgit v1.2.3 From d091c6312561821f216ced63a7ad17c946b6d335 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 31 Jul 2025 13:23:42 +0100 Subject: coresight: Avoid enabling the programming clock twice The programming clock is enabled by the AMBA bus driver before a dynamic probe. As a result, a CoreSight driver may redundantly enable the same clock. To avoid this, add a check for the device type and skip enabling the programming clock for AMBA devices. The returned NULL pointer will be tolerated by the drivers. Fixes: 73d779a03a76 ("coresight: etm4x: Change etm4_platform_driver driver for MMIO devices") Reviewed-by: Anshuman Khandual Reviewed-by: Yeoreum Yun Tested-by: James Clark Signed-off-by: Leo Yan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250731-arm_cs_fix_clock_v4-v6-6-1dfe10bb3f6f@arm.com --- include/linux/coresight.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 1e652e157841..bb49080ec8f9 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -481,20 +481,23 @@ static inline bool is_coresight_device(void __iomem *base) * Returns: * * clk - Clock is found and enabled - * NULL - Clock is controlled by firmware (ACPI device only) + * NULL - Clock is controlled by firmware (ACPI device only) or when managed + * by the AMBA bus driver instead * ERROR - Clock is found but failed to enable */ static inline struct clk *coresight_get_enable_apb_pclk(struct device *dev) { - struct clk *pclk; + struct clk *pclk = NULL; /* Firmware controls clocks for an ACPI device.
*/ if (has_acpi_companion(dev)) return NULL; - pclk = devm_clk_get_optional_enabled(dev, "apb_pclk"); - if (!pclk) - pclk = devm_clk_get_optional_enabled(dev, "apb"); + if (!dev_is_amba(dev)) { + pclk = devm_clk_get_optional_enabled(dev, "apb_pclk"); + if (!pclk) + pclk = devm_clk_get_optional_enabled(dev, "apb"); + } return pclk; } -- cgit v1.2.3 From fbe7514a7912959e384acf108931ac1bfbb16466 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 31 Jul 2025 13:23:43 +0100 Subject: coresight: Consolidate clock enabling CoreSight drivers enable pclk and atclk conditionally. For example, pclk is only enabled in the static probe, while atclk is an optional clock that is enabled for both dynamic and static probes, if present. In the current CoreSight drivers, these two clocks are initialized separately. This causes complex and duplicated code. CoreSight drivers are refined so that clocks are initialized in one go. For this purpose, this commit renames coresight_get_enable_apb_pclk() to coresight_get_enable_clocks() and encapsulates the clock initialization logic: - If a clock is initialized successfully, its clock pointer is assigned to the double pointer passed as an argument. - For ACPI devices, clocks are controlled by firmware, so bail out directly. - Skip enabling pclk for an AMBA device. - If atclk is not found, the corresponding double pointer is set to NULL. The function returns success (0) so that callers can proceed with no error. - Otherwise, an error number is returned for failures. The function has become complex, so move it from the header to the CoreSight core layer and export the symbol. Comments are added to record the details. Suggested-by: Suzuki K Poulose Reviewed-by: Anshuman Khandual Reviewed-by: Yeoreum Yun Tested-by: James Clark Signed-off-by: Leo Yan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250731-arm_cs_fix_clock_v4-v6-7-1dfe10bb3f6f@arm.com --- drivers/hwtracing/coresight/coresight-catu.c | 10 ++--- drivers/hwtracing/coresight/coresight-core.c | 48 ++++++++++++++++++++++ drivers/hwtracing/coresight/coresight-cpu-debug.c | 8 ++-- drivers/hwtracing/coresight/coresight-ctcu-core.c | 8 ++-- drivers/hwtracing/coresight/coresight-etm4x-core.c | 11 ++--- drivers/hwtracing/coresight/coresight-funnel.c | 11 ++--- drivers/hwtracing/coresight/coresight-replicator.c | 11 ++--- drivers/hwtracing/coresight/coresight-stm.c | 9 ++-- drivers/hwtracing/coresight/coresight-tmc-core.c | 10 ++--- drivers/hwtracing/coresight/coresight-tpiu.c | 10 ++--- include/linux/coresight.h | 30 +------------- 11 files changed, 83 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c index 4c345ff2cff1..0f476a0cbd74 100644 --- a/drivers/hwtracing/coresight/coresight-catu.c +++ b/drivers/hwtracing/coresight/coresight-catu.c @@ -520,9 +520,9 @@ static int __catu_probe(struct device *dev, struct resource *res) struct coresight_platform_data *pdata = NULL; void __iomem *base; - drvdata->atclk = devm_clk_get_optional_enabled(dev, "atclk"); - if (IS_ERR(drvdata->atclk)) - return PTR_ERR(drvdata->atclk); + ret = coresight_get_enable_clocks(dev, &drvdata->pclk, &drvdata->atclk); + if (ret) + return ret; catu_desc.name = coresight_alloc_device_name(&catu_devs, dev); if (!catu_desc.name) @@ -634,10 +634,6 @@ static int catu_platform_probe(struct platform_device *pdev) if (!drvdata) return -ENOMEM; - drvdata->pclk = coresight_get_enable_apb_pclk(&pdev->dev); if (IS_ERR(drvdata->pclk))
- return PTR_ERR(drvdata->pclk); - pm_runtime_get_noresume(&pdev->dev); pm_runtime_set_active(&pdev->dev); pm_runtime_enable(&pdev->dev); diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c index 1accd7cbd54b..3267192f0c1c 100644 --- a/drivers/hwtracing/coresight/coresight-core.c +++ b/drivers/hwtracing/coresight/coresight-core.c @@ -3,6 +3,7 @@ * Copyright (c) 2012, The Linux Foundation. All rights reserved. */ +#include #include #include #include @@ -1700,6 +1701,53 @@ int coresight_etm_get_trace_id(struct coresight_device *csdev, enum cs_mode mode } EXPORT_SYMBOL_GPL(coresight_etm_get_trace_id); +/* + * Attempt to find and enable programming clock (pclk) and trace clock (atclk) + * for the given device. + * + * For ACPI devices, clocks are controlled by firmware, so bail out early in + * this case. Also, skip enabling pclk if the clock is managed by the AMBA + * bus driver instead. + * + * atclk is an optional clock; it will only be enabled when it exists. + * Otherwise, a NULL pointer will be returned to the caller. + * + * Returns: '0' on Success; Error code otherwise. */ +int coresight_get_enable_clocks(struct device *dev, struct clk **pclk, + struct clk **atclk) +{ + WARN_ON(!pclk); + + if (has_acpi_companion(dev)) + return 0; + + if (!dev_is_amba(dev)) { + /* + * "apb_pclk" is the default clock name for an Arm Primecell + * peripheral, while "apb" is used only by the CTCU driver. + * + * For easier maintenance, CoreSight drivers should use + * "apb_pclk" as the programming clock name. + */ + *pclk = devm_clk_get_optional_enabled(dev, "apb_pclk"); + if (!*pclk) + *pclk = devm_clk_get_optional_enabled(dev, "apb"); + if (IS_ERR(*pclk)) + return PTR_ERR(*pclk); + } + + /* Initialization of atclk is skipped if it is a NULL pointer.
*/ + if (atclk) { + *atclk = devm_clk_get_optional_enabled(dev, "atclk"); + if (IS_ERR(*atclk)) + return PTR_ERR(*atclk); + } + + return 0; +} +EXPORT_SYMBOL_GPL(coresight_get_enable_clocks); + MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Pratik Patel "); MODULE_AUTHOR("Mathieu Poirier "); diff --git a/drivers/hwtracing/coresight/coresight-cpu-debug.c b/drivers/hwtracing/coresight/coresight-cpu-debug.c index e39dfb886688..5f6db2fb95d4 100644 --- a/drivers/hwtracing/coresight/coresight-cpu-debug.c +++ b/drivers/hwtracing/coresight/coresight-cpu-debug.c @@ -566,6 +566,10 @@ static int __debug_probe(struct device *dev, struct resource *res) void __iomem *base; int ret; + ret = coresight_get_enable_clocks(dev, &drvdata->pclk, NULL); + if (ret) + return ret; + drvdata->cpu = coresight_get_cpu(dev); if (drvdata->cpu < 0) return drvdata->cpu; @@ -697,10 +701,6 @@ static int debug_platform_probe(struct platform_device *pdev) if (!drvdata) return -ENOMEM; - drvdata->pclk = coresight_get_enable_apb_pclk(&pdev->dev); - if (IS_ERR(drvdata->pclk)) - return PTR_ERR(drvdata->pclk); - dev_set_drvdata(&pdev->dev, drvdata); pm_runtime_get_noresume(&pdev->dev); pm_runtime_set_active(&pdev->dev); diff --git a/drivers/hwtracing/coresight/coresight-ctcu-core.c b/drivers/hwtracing/coresight/coresight-ctcu-core.c index de279efe3405..75b5114ef652 100644 --- a/drivers/hwtracing/coresight/coresight-ctcu-core.c +++ b/drivers/hwtracing/coresight/coresight-ctcu-core.c @@ -188,7 +188,7 @@ static int ctcu_probe(struct platform_device *pdev) const struct ctcu_config *cfgs; struct ctcu_drvdata *drvdata; void __iomem *base; - int i; + int i, ret; desc.name = coresight_alloc_device_name(&ctcu_devs, dev); if (!desc.name) @@ -207,9 +207,9 @@ static int ctcu_probe(struct platform_device *pdev) if (IS_ERR(base)) return PTR_ERR(base); - drvdata->apb_clk = coresight_get_enable_apb_pclk(dev); - if (IS_ERR(drvdata->apb_clk)) - return PTR_ERR(drvdata->apb_clk); + ret = coresight_get_enable_clocks(dev, &drvdata->apb_clk, NULL); + if (ret) + return ret; cfgs = of_device_get_match_data(dev); if (cfgs) { diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c index 4b98a7bf4cb7..020f070bf17d 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x-core.c +++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c @@ -2217,13 +2217,14 @@ static int etm4_probe(struct device *dev) struct csdev_access access = { 0 }; struct etm4_init_arg init_arg = { 0 }; struct etm4_init_arg *delayed; + int ret; if (WARN_ON(!drvdata)) return -ENOMEM; - drvdata->atclk = devm_clk_get_optional_enabled(dev, "atclk"); - if (IS_ERR(drvdata->atclk)) - return PTR_ERR(drvdata->atclk); + ret = coresight_get_enable_clocks(dev, &drvdata->pclk, &drvdata->atclk); + if (ret) + return ret; if (pm_save_enable == PARAM_PM_SAVE_FIRMWARE) pm_save_enable = coresight_loses_context_with_cpu(dev) ? 
@@ -2307,10 +2308,6 @@ static int etm4_probe_platform_dev(struct platform_device *pdev) if (!drvdata) return -ENOMEM; - drvdata->pclk = coresight_get_enable_apb_pclk(&pdev->dev); - if (IS_ERR(drvdata->pclk)) - return PTR_ERR(drvdata->pclk); - if (res) { drvdata->base = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(drvdata->base)) diff --git a/drivers/hwtracing/coresight/coresight-funnel.c b/drivers/hwtracing/coresight/coresight-funnel.c index b044a4125310..02e0dc678a32 100644 --- a/drivers/hwtracing/coresight/coresight-funnel.c +++ b/drivers/hwtracing/coresight/coresight-funnel.c @@ -217,6 +217,7 @@ static int funnel_probe(struct device *dev, struct resource *res) struct coresight_platform_data *pdata = NULL; struct funnel_drvdata *drvdata; struct coresight_desc desc = { 0 }; + int ret; if (is_of_node(dev_fwnode(dev)) && of_device_is_compatible(dev->of_node, "arm,coresight-funnel")) @@ -230,13 +231,9 @@ static int funnel_probe(struct device *dev, struct resource *res) if (!drvdata) return -ENOMEM; - drvdata->atclk = devm_clk_get_optional_enabled(dev, "atclk"); - if (IS_ERR(drvdata->atclk)) - return PTR_ERR(drvdata->atclk); - - drvdata->pclk = coresight_get_enable_apb_pclk(dev); - if (IS_ERR(drvdata->pclk)) - return PTR_ERR(drvdata->pclk); + ret = coresight_get_enable_clocks(dev, &drvdata->pclk, &drvdata->atclk); + if (ret) + return ret; /* * Map the device base for dynamic-funnel, which has been diff --git a/drivers/hwtracing/coresight/coresight-replicator.c b/drivers/hwtracing/coresight/coresight-replicator.c index 9e8bd36e7a9a..f1bbd12e63e0 100644 --- a/drivers/hwtracing/coresight/coresight-replicator.c +++ b/drivers/hwtracing/coresight/coresight-replicator.c @@ -223,6 +223,7 @@ static int replicator_probe(struct device *dev, struct resource *res) struct replicator_drvdata *drvdata; struct coresight_desc desc = { 0 }; void __iomem *base; + int ret; if (is_of_node(dev_fwnode(dev)) && of_device_is_compatible(dev->of_node, "arm,coresight-replicator")) @@ -237,13 +238,9 @@ static int replicator_probe(struct device *dev, struct resource *res) if (!drvdata) return -ENOMEM; - drvdata->atclk = devm_clk_get_optional_enabled(dev, "atclk"); - if (IS_ERR(drvdata->atclk)) - return PTR_ERR(drvdata->atclk); - - drvdata->pclk = coresight_get_enable_apb_pclk(dev); - if (IS_ERR(drvdata->pclk)) - return PTR_ERR(drvdata->pclk); + ret = coresight_get_enable_clocks(dev, &drvdata->pclk, &drvdata->atclk); + if (ret) + return ret; /* * Map the device base for dynamic-replicator, which has been diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c index 275d67b91dfd..23ba33ac4d2e 100644 --- a/drivers/hwtracing/coresight/coresight-stm.c +++ b/drivers/hwtracing/coresight/coresight-stm.c @@ -842,13 +842,10 @@ static int __stm_probe(struct device *dev, struct resource *res) if (!drvdata) return -ENOMEM; - drvdata->atclk = devm_clk_get_optional_enabled(dev, "atclk"); - if (IS_ERR(drvdata->atclk)) - return PTR_ERR(drvdata->atclk); + ret = coresight_get_enable_clocks(dev, &drvdata->pclk, &drvdata->atclk); + if (ret) + return ret; - drvdata->pclk = coresight_get_enable_apb_pclk(dev); - if (IS_ERR(drvdata->pclk)) - return PTR_ERR(drvdata->pclk); dev_set_drvdata(dev, drvdata); base = devm_ioremap_resource(dev, res); diff --git a/drivers/hwtracing/coresight/coresight-tmc-core.c b/drivers/hwtracing/coresight/coresight-tmc-core.c index 519f22542807..25c987e2d114 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-core.c +++ 
b/drivers/hwtracing/coresight/coresight-tmc-core.c @@ -779,9 +779,9 @@ static int __tmc_probe(struct device *dev, struct resource *res) struct coresight_desc desc = { 0 }; struct coresight_dev_list *dev_list = NULL; - drvdata->atclk = devm_clk_get_optional_enabled(dev, "atclk"); - if (IS_ERR(drvdata->atclk)) - return PTR_ERR(drvdata->atclk); + ret = coresight_get_enable_clocks(dev, &drvdata->pclk, &drvdata->atclk); + if (ret) + return ret; ret = -ENOMEM; @@ -979,10 +979,6 @@ static int tmc_platform_probe(struct platform_device *pdev) if (!drvdata) return -ENOMEM; - drvdata->pclk = coresight_get_enable_apb_pclk(&pdev->dev); - if (IS_ERR(drvdata->pclk)) - return PTR_ERR(drvdata->pclk); - dev_set_drvdata(&pdev->dev, drvdata); pm_runtime_get_noresume(&pdev->dev); pm_runtime_set_active(&pdev->dev); diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c index 8d6179c83e5d..5e47d761e1c4 100644 --- a/drivers/hwtracing/coresight/coresight-tpiu.c +++ b/drivers/hwtracing/coresight/coresight-tpiu.c @@ -132,6 +132,7 @@ static int __tpiu_probe(struct device *dev, struct resource *res) struct coresight_platform_data *pdata = NULL; struct tpiu_drvdata *drvdata; struct coresight_desc desc = { 0 }; + int ret; desc.name = coresight_alloc_device_name(&tpiu_devs, dev); if (!desc.name) @@ -143,13 +144,10 @@ static int __tpiu_probe(struct device *dev, struct resource *res) spin_lock_init(&drvdata->spinlock); - drvdata->atclk = devm_clk_get_optional_enabled(dev, "atclk"); - if (IS_ERR(drvdata->atclk)) - return PTR_ERR(drvdata->atclk); + ret = coresight_get_enable_clocks(dev, &drvdata->pclk, &drvdata->atclk); + if (ret) + return ret; - drvdata->pclk = coresight_get_enable_apb_pclk(dev); - if (IS_ERR(drvdata->pclk)) - return PTR_ERR(drvdata->pclk); dev_set_drvdata(dev, drvdata); /* Validity for the resource is already checked by the AMBA core */ diff --git a/include/linux/coresight.h b/include/linux/coresight.h index bb49080ec8f9..6de59ce8ef8c 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -6,7 +6,6 @@ #ifndef _LINUX_CORESIGHT_H #define _LINUX_CORESIGHT_H -#include #include #include #include @@ -475,33 +474,6 @@ static inline bool is_coresight_device(void __iomem *base) return cid == CORESIGHT_CID; } -/* - * Attempt to find and enable "APB clock" for the given device - * - * Returns: - * - * clk - Clock is found and enabled - * NULL - Clock is controlled by firmware (ACPI device only) or when managed - * by the AMBA bus driver instead - * ERROR - Clock is found but failed to enable - */ -static inline struct clk *coresight_get_enable_apb_pclk(struct device *dev) -{ - struct clk *pclk = NULL; - - /* Firmware controls clocks for an ACPI device. 
*/ - if (has_acpi_companion(dev)) return NULL; - - if (!dev_is_amba(dev)) { - pclk = devm_clk_get_optional_enabled(dev, "apb_pclk"); - if (!pclk) - pclk = devm_clk_get_optional_enabled(dev, "apb"); - } - - return pclk; -} - #define CORESIGHT_PIDRn(i) (0xFE0 + ((i) * 4)) static inline u32 coresight_get_pid(struct csdev_access *csa) @@ -732,4 +704,6 @@ void coresight_remove_driver(struct amba_driver *amba_drv, struct platform_driver *pdev_drv); int coresight_etm_get_trace_id(struct coresight_device *csdev, enum cs_mode mode, struct coresight_device *sink); +int coresight_get_enable_clocks(struct device *dev, struct clk **pclk, + struct clk **atclk); #endif /* _LINUX_COREISGHT_H */ -- cgit v1.2.3 From ade2105e748f85eb026d26701091213855aea633 Mon Sep 17 00:00:00 2001 From: Elijah Wright Date: Wed, 20 Aug 2025 22:39:07 -0700 Subject: tracing: Move buffer in trace_seq to end of struct TRACE_SEQ_BUFFER_SIZE is dependent on the architecture for its size. On 64-bit systems, it is 8148 bytes. The forced 8-byte alignment of size_t and seq_buf means that trace_seq is 8200 bytes on 64-bit systems rather than the intended 8192. Moving the buffer to the end of the struct fixes the issue. There shouldn't be any side effects, i.e. for code doing pointer arithmetic on trace_seq. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250821053917.23301-1-git@elijahs.space Signed-off-by: Elijah Wright Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_seq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index a93ed5ac3226..557780fe1c77 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -21,10 +21,10 @@ (sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int))) struct trace_seq { - char buffer[TRACE_SEQ_BUFFER_SIZE]; struct seq_buf seq; size_t readpos; int full; + char buffer[TRACE_SEQ_BUFFER_SIZE]; }; static inline void -- cgit v1.2.3 From 5c8fd7e2b5b0a527cf88740da122166695382a78 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 23 Sep 2025 12:24:00 +0100 Subject: bpf: bpf task work plumbing This patch adds the necessary plumbing in the verifier, syscall, and map code to support handling the new kfunc bpf_task_work_schedule and the kernel structure bpf_task_work. The idea is similar to how we already handle bpf_wq and bpf_timer. The verifier changes validate calls to bpf_task_work_schedule to make sure they are safe and that the expected invariants hold. The BTF part is required to detect the bpf_task_work structure inside a map value and store its offset, which will be used in the next patch to calculate key and value addresses. The arraymap and hashtab changes are needed to handle freeing of the bpf_task_work: run the code needed to deinitialize it, for example cancelling the task_work callback if possible. The use of bpf_task_work and the proper implementation of the kfuncs are introduced in the next patch.
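As a rough sketch of how this plumbing is meant to be used, a BPF program could embed struct bpf_task_work in a map value and schedule a callback as below once the kfuncs gain their real implementation in the next patch. The map, attach point, and callback body are invented for illustration; only the kfunc signature and the callback prototype come from this patch:

  // SPDX-License-Identifier: GPL-2.0
  #include <vmlinux.h>
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  char LICENSE[] SEC("license") = "GPL";

  /* Kfunc signature as added by this patch (still a stub at this point). */
  extern int bpf_task_work_schedule_resume(struct task_struct *task,
  		struct bpf_task_work *tw, void *map__map,
  		int (*callback)(struct bpf_map *map, void *key, void *value),
  		void *aux__prog) __ksym;

  struct elem {
  	struct bpf_task_work tw;	/* found by btf_parse_fields(), offset cached */
  	__u64 hits;
  };

  struct {
  	__uint(type, BPF_MAP_TYPE_HASH);
  	__uint(max_entries, 1);
  	__type(key, int);
  	__type(value, struct elem);
  } tw_map SEC(".maps");

  /* Async callback; set_task_work_schedule_callback_state() hands it
   * (map, key, value), and it runs later in the target task's context. */
  static int on_task_work(struct bpf_map *map, void *key, void *value)
  {
  	struct elem *e = value;

  	e->hits++;
  	return 0;
  }

  SEC("tp_btf/sched_switch")	/* hypothetical attach point */
  int BPF_PROG(schedule_tw)
  {
  	struct task_struct *task = bpf_get_current_task_btf();
  	struct elem *e;
  	int key = 0;

  	e = bpf_map_lookup_elem(&tw_map, &key);
  	if (!e)
  		return 0;
  	/* check_kfunc_args() verifies that &e->tw and &tw_map refer to the
  	 * same map (matching map_uid). */
  	bpf_task_work_schedule_resume(task, &e->tw, &tw_map, on_task_work, NULL);
  	return 0;
  }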
Signed-off-by: Mykyta Yatsenko Acked-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250923112404.668720-6-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 ++++ include/uapi/linux/bpf.h | 4 ++ kernel/bpf/arraymap.c | 8 +-- kernel/bpf/btf.c | 7 +++ kernel/bpf/hashtab.c | 19 ++++--- kernel/bpf/helpers.c | 40 ++++++++++++++ kernel/bpf/syscall.c | 16 +++++- kernel/bpf/verifier.c | 117 ++++++++++++++++++++++++++++++++++++++--- tools/include/uapi/linux/bpf.h | 4 ++ 9 files changed, 208 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dfc1a27b56d5..a2ab51fa8b0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -209,6 +209,7 @@ enum btf_field_type { BPF_WORKQUEUE = (1 << 10), BPF_UPTR = (1 << 11), BPF_RES_SPIN_LOCK = (1 << 12), + BPF_TASK_WORK = (1 << 13), }; enum bpf_cgroup_storage_type { @@ -262,6 +263,7 @@ struct btf_record { int timer_off; int wq_off; int refcount_off; + int task_work_off; struct btf_field fields[]; }; @@ -363,6 +365,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "bpf_rb_node"; case BPF_REFCOUNT: return "bpf_refcount"; + case BPF_TASK_WORK: + return "bpf_task_work"; default: WARN_ON_ONCE(1); return "unknown"; @@ -401,6 +405,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_rb_node); case BPF_REFCOUNT: return sizeof(struct bpf_refcount); + case BPF_TASK_WORK: + return sizeof(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; @@ -433,6 +439,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_rb_node); case BPF_REFCOUNT: return __alignof__(struct bpf_refcount); + case BPF_TASK_WORK: + return __alignof__(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; @@ -464,6 +472,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: + case BPF_TASK_WORK: break; default: WARN_ON_ONCE(1); @@ -600,6 +609,7 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); void bpf_wq_cancel_and_free(void *timer); +void bpf_task_work_cancel_and_free(void *timer); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); void bpf_rb_root_free(const struct btf_field *field, void *rb_root, @@ -2426,6 +2436,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj); +void bpf_obj_free_task_work(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f3b173e48b0f..ae83d8649ef1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7436,6 +7436,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_task_work { + __u64 __opaque; +} __attribute__((aligned(8))); + struct bpf_wq { __u64 __opaque[2]; } __attribute__((aligned(8))); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 26d5dda989bc..80b1765a3159 100644 --- 
a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -443,7 +443,7 @@ static void *array_map_vmalloc_addr(struct bpf_array *array) return (void *)round_down((unsigned long)array, PAGE_SIZE); } -static void array_map_free_timers_wq(struct bpf_map *map) +static void array_map_free_internal_structs(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; @@ -451,12 +451,14 @@ static void array_map_free_timers_wq(struct bpf_map *map) /* We don't reset or free fields other than timer and workqueue * on uref dropping to zero. */ - if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) { + if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { for (i = 0; i < array->map.max_entries; i++) { if (btf_record_has_field(map->record, BPF_TIMER)) bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); if (btf_record_has_field(map->record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); + if (btf_record_has_field(map->record, BPF_TASK_WORK)) + bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i)); } } } @@ -795,7 +797,7 @@ const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, - .map_release_uref = array_map_free_timers_wq, + .map_release_uref = array_map_free_internal_structs, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c51e16bbf0c1..9f47a3aa7ff8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3490,6 +3490,7 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_ { BPF_RES_SPIN_LOCK, "bpf_res_spin_lock", true }, { BPF_TIMER, "bpf_timer", true }, { BPF_WORKQUEUE, "bpf_wq", true }, + { BPF_TASK_WORK, "bpf_task_work", true }, { BPF_LIST_HEAD, "bpf_list_head", false }, { BPF_LIST_NODE, "bpf_list_node", false }, { BPF_RB_ROOT, "bpf_rb_root", false }, @@ -3675,6 +3676,7 @@ static int btf_find_field_one(const struct btf *btf, case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: + case BPF_TASK_WORK: ret = btf_find_struct(btf, var_type, off, sz, field_type, info_cnt ? 
&info[0] : &tmp); if (ret < 0) @@ -3967,6 +3969,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->timer_off = -EINVAL; rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; + rec->task_work_off = -EINVAL; for (i = 0; i < cnt; i++) { field_type_size = btf_field_type_size(info_arr[i].type); if (info_arr[i].off + field_type_size > value_size) { @@ -4006,6 +4009,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* Cache offset for faster lookup at runtime */ rec->wq_off = rec->fields[i].offset; break; + case BPF_TASK_WORK: + WARN_ON_ONCE(rec->task_work_off >= 0); + rec->task_work_off = rec->fields[i].offset; + break; case BPF_REFCOUNT: WARN_ON_ONCE(rec->refcount_off >= 0); /* Cache offset for faster lookup at runtime */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2319f8f8fa3e..c2fcd0cd51e5 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -223,9 +223,12 @@ static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem * if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, htab_elem_value(elem, htab->map.key_size)); + if (btf_record_has_field(htab->map.record, BPF_TASK_WORK)) + bpf_obj_free_task_work(htab->map.record, + htab_elem_value(elem, htab->map.key_size)); } -static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) +static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; @@ -1495,7 +1498,7 @@ static void delete_all_elements(struct bpf_htab *htab) } } -static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) +static void htab_free_malloced_internal_structs(struct bpf_htab *htab) { int i; @@ -1514,16 +1517,16 @@ static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) rcu_read_unlock(); } -static void htab_map_free_timers_and_wq(struct bpf_map *map) +static void htab_map_free_internal_structs(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); /* We only free timer and workqueue on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) { + if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { if (!htab_is_prealloc(htab)) - htab_free_malloced_timers_and_wq(htab); + htab_free_malloced_internal_structs(htab); else - htab_free_prealloced_timers_and_wq(htab); + htab_free_prealloced_internal_structs(htab); } } @@ -2255,7 +2258,7 @@ const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers_and_wq, + .map_release_uref = htab_map_free_internal_structs, .map_lookup_elem = htab_map_lookup_elem, .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, @@ -2276,7 +2279,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers_and_wq, + .map_release_uref = htab_map_free_internal_structs, .map_lookup_elem = htab_lru_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 969f63f8ca28..7f5e528df13b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3906,8 
+3906,48 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, } #endif /* CONFIG_KEYS */ +typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value); + +/** + * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode + * @task: Task struct for which callback should be scheduled + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping + * @map__map: bpf_map that embeds struct bpf_task_work in the values + * @callback: pointer to BPF subprogram to call + * @aux__prog: user should pass NULL + * + * Return: 0 if task work has been scheduled successfully, negative error code otherwise + */ +__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + void *aux__prog) +{ + return 0; +} + +/** + * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode + * @task: Task struct for which callback should be scheduled + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping + * @map__map: bpf_map that embeds struct bpf_task_work in the values + * @callback: pointer to BPF subprogram to call + * @aux__prog: user should pass NULL + * + * Return: 0 if task work has been scheduled successfully, negative error code otherwise + */ +__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + void *aux__prog) +{ + return 0; +} + __bpf_kfunc_end_defs(); +void bpf_task_work_cancel_and_free(void *val) +{ +} + BTF_KFUNCS_START(generic_btf_ids) #ifdef CONFIG_CRASH_DUMP BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8a3c3d26f6e2..adb05d235011 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -674,6 +674,7 @@ void btf_record_free(struct btf_record *rec) case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: + case BPF_TASK_WORK: /* Nothing to release */ break; default: @@ -727,6 +728,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: + case BPF_TASK_WORK: /* Nothing to acquire */ break; default: @@ -785,6 +787,13 @@ void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) bpf_wq_cancel_and_free(obj + rec->wq_off); } +void bpf_obj_free_task_work(const struct btf_record *rec, void *obj) +{ + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK))) + return; + bpf_task_work_cancel_and_free(obj + rec->task_work_off); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -809,6 +818,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) case BPF_WORKQUEUE: bpf_wq_cancel_and_free(field_ptr); break; + case BPF_TASK_WORK: + bpf_task_work_cancel_and_free(field_ptr); + break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; @@ -1240,7 +1252,8 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR | + BPF_TASK_WORK, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1272,6 +1285,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, break; 
case BPF_TIMER: case BPF_WORKQUEUE: + case BPF_TASK_WORK: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 02b93a54a446..ceeb0ffe7d67 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2224,10 +2224,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. */ - if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) - reg->map_uid = reg->id; - if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE)) + if (btf_record_has_field(map->inner_map_meta->record, + BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { reg->map_uid = reg->id; + } } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || @@ -8461,6 +8461,9 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, case BPF_TIMER: field_off = map->record->timer_off; break; + case BPF_TASK_WORK: + field_off = map->record->task_work_off; + break; default: verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); return -EINVAL; @@ -8514,6 +8517,26 @@ static int process_wq_func(struct bpf_verifier_env *env, int regno, return 0; } +static int process_task_work_func(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_map_field_pointer(env, regno, BPF_TASK_WORK); + if (err) + return err; + + if (meta->map.ptr) { + verifier_bug(env, "Two map pointers in a bpf_task_work helper"); + return -EFAULT; + } + meta->map.uid = reg->map_uid; + meta->map.ptr = map; + return 0; +} + static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { @@ -10343,6 +10366,8 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *callee, int insn_idx); +static bool is_task_work_add_kfunc(u32 func_id); + static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); @@ -10561,7 +10586,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, insn_idx, subprog, - is_bpf_wq_set_callback_impl_kfunc(insn->imm)); + is_bpf_wq_set_callback_impl_kfunc(insn->imm) || + is_task_work_add_kfunc(insn->imm)); if (!async_cb) return -EFAULT; callee = async_cb->frame[0]; @@ -10876,6 +10902,36 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_map *map_ptr = caller->regs[BPF_REG_3].map_ptr; + + /* + * callback_fn(struct bpf_map *map, void *key, void *value); + */ + callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_1].map_ptr = map_ptr; + + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].map_ptr = map_ptr; + + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); + 
callee->regs[BPF_REG_3].map_ptr = map_ptr; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + callee->in_async_callback_fn = true; + callee->callback_ret_range = retval_range(S32_MIN, S32_MAX); + return 0; +} + static bool is_rbtree_lock_required_kfunc(u32 btf_id); /* Are we currently verifying the callback for a rbtree helper that must @@ -12000,6 +12056,7 @@ enum { KF_ARG_RB_NODE_ID, KF_ARG_WORKQUEUE_ID, KF_ARG_RES_SPIN_LOCK_ID, + KF_ARG_TASK_WORK_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -12010,6 +12067,7 @@ BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) BTF_ID(struct, bpf_res_spin_lock) +BTF_ID(struct, bpf_task_work) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -12058,6 +12116,11 @@ static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); } +static bool is_kfunc_arg_task_work(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TASK_WORK_ID); +} + static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); @@ -12145,6 +12208,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_WORKQUEUE, KF_ARG_PTR_TO_IRQ_FLAG, KF_ARG_PTR_TO_RES_SPIN_LOCK, + KF_ARG_PTR_TO_TASK_WORK, }; enum special_kfunc_type { @@ -12194,6 +12258,8 @@ enum special_kfunc_type { KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, KF___bpf_trap, + KF_bpf_task_work_schedule_signal, + KF_bpf_task_work_schedule_resume, }; BTF_ID_LIST(special_kfunc_list) @@ -12262,6 +12328,14 @@ BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) BTF_ID(func, __bpf_trap) +BTF_ID(func, bpf_task_work_schedule_signal) +BTF_ID(func, bpf_task_work_schedule_resume) + +static bool is_task_work_add_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; +} static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -12352,6 +12426,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_wq(meta->btf, &args[argno])) return KF_ARG_PTR_TO_WORKQUEUE; + if (is_kfunc_arg_task_work(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_TASK_WORK; + if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) return KF_ARG_PTR_TO_IRQ_FLAG; @@ -12695,7 +12772,8 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id) static bool is_async_callback_calling_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] || + is_task_work_add_kfunc(btf_id); } static bool is_bpf_throw_kfunc(struct bpf_insn *insn) @@ -13076,7 +13154,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "pointer in R%d isn't map pointer\n", regno); return -EINVAL; } - if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) { + if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 || + reg->map_ptr->record->task_work_off >= 0)) { /* Use map_uid (which is unique id of inner map) to reject: * inner_map1 = bpf_map_lookup_elem(outer_map, key1) * inner_map2 = bpf_map_lookup_elem(outer_map, key2) @@ -13091,6 +13170,12 @@ static int check_kfunc_args(struct 
bpf_verifier_env *env, struct bpf_kfunc_call_ */ if (meta->map.ptr != reg->map_ptr || meta->map.uid != reg->map_uid) { + if (reg->map_ptr->record->task_work_off >= 0) { + verbose(env, + "bpf_task_work pointer in R2 map_uid=%d doesn't match map pointer in R3 map_uid=%d\n", + meta->map.uid, reg->map_uid); + return -EINVAL; + } verbose(env, "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", meta->map.uid, reg->map_uid); @@ -13129,6 +13214,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: + case KF_ARG_PTR_TO_TASK_WORK: case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; @@ -13422,6 +13508,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_TASK_WORK: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = process_task_work_func(env, regno, meta); + if (ret < 0) + return ret; + break; case KF_ARG_PTR_TO_IRQ_FLAG: if (reg->type != PTR_TO_STACK) { verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i); @@ -13788,6 +13883,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (is_task_work_add_kfunc(meta.func_id)) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_task_work_schedule_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, meta.func_id); + return err; + } + } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f3b173e48b0f..ae83d8649ef1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7436,6 +7436,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_task_work { + __u64 __opaque; +} __attribute__((aligned(8))); + struct bpf_wq { __u64 __opaque[2]; } __attribute__((aligned(8))); -- cgit v1.2.3 From 9082aae154be2d9e208b56e249cb886612f7c6cf Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Aug 2025 09:19:22 -0400 Subject: sunrpc: remove dfprintk_cont() and dfprintk_rcu_cont() KERN_CONT hails from a simpler time, when SMP wasn't the norm. These days, it doesn't quite work right since another printk() can always race in between the first one and the one being "continued". Nothing calls dprintk_rcu_cont(), so just remove it. The only caller of dprintk_cont() is in nfs_commit_release_pages(). Just use a normal dprintk() there instead, since this is not SMP-safe anyway. 
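To make the race concrete, here is a minimal sketch of the hazard (the function and messages are invented for illustration):

  static void kern_cont_race_demo(int status)
  {
  	printk(KERN_DEFAULT "demo: commit verifier check");
  	/* <-- another CPU's printk() can land right here --> */
  	printk(KERN_CONT " status=%d\n", status);	/* may continue the interloper's line */
  }

Emitting one complete line per call, as the conversion in this patch does, sidesteps the problem entirely.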
Signed-off-by: Jeff Layton Reviewed-by: Chuck Lever Reviewed-by: Simon Horman Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 6 +++--- include/linux/sunrpc/debug.h | 24 ++---------------------- 2 files changed, 5 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 52a763b6aa3a..66acc5247435 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1818,7 +1818,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) nfs_mapping_set_error(folio, status); nfs_inode_remove_request(req); } - dprintk_cont(", error = %d\n", status); + dprintk(", error = %d\n", status); goto next; } @@ -1828,11 +1828,11 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* We have a match */ if (folio) nfs_inode_remove_request(req); - dprintk_cont(" OK\n"); + dprintk(" OK\n"); goto next; } /* We have a mismatch. Write the page again */ - dprintk_cont(" mismatch\n"); + dprintk(" mismatch\n"); nfs_mark_request_dirty(req); atomic_long_inc(&NFS_I(data->inode)->redirtied_pages); next: diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index f6aeed07fe04..99a6fa4a1d6a 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -23,12 +23,8 @@ extern unsigned int nlm_debug; #define dprintk(fmt, ...) \ dfprintk(FACILITY, fmt, ##__VA_ARGS__) -#define dprintk_cont(fmt, ...) \ - dfprintk_cont(FACILITY, fmt, ##__VA_ARGS__) #define dprintk_rcu(fmt, ...) \ dfprintk_rcu(FACILITY, fmt, ##__VA_ARGS__) -#define dprintk_rcu_cont(fmt, ...) \ - dfprintk_rcu_cont(FACILITY, fmt, ##__VA_ARGS__) #undef ifdebug #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -37,29 +33,14 @@ extern unsigned int nlm_debug; # define dfprintk(fac, fmt, ...) \ do { \ ifdebug(fac) \ - printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ -} while (0) - -# define dfprintk_cont(fac, fmt, ...) \ -do { \ - ifdebug(fac) \ - printk(KERN_CONT fmt, ##__VA_ARGS__); \ + printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ } while (0) # define dfprintk_rcu(fac, fmt, ...) \ do { \ ifdebug(fac) { \ rcu_read_lock(); \ - printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ - rcu_read_unlock(); \ - } \ -} while (0) - -# define dfprintk_rcu_cont(fac, fmt, ...) \ -do { \ - ifdebug(fac) { \ - rcu_read_lock(); \ - printk(KERN_CONT fmt, ##__VA_ARGS__); \ + printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ rcu_read_unlock(); \ } \ } while (0) @@ -68,7 +49,6 @@ do { \ #else # define ifdebug(fac) if (0) # define dfprintk(fac, fmt, ...) do {} while (0) -# define dfprintk_cont(fac, fmt, ...) do {} while (0) # define dfprintk_rcu(fac, fmt, ...) do {} while (0) # define RPC_IFDEBUG(x) #endif -- cgit v1.2.3 From ec7d8e68ef0ec5c635c8f9e93cd881673445a397 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Aug 2025 09:19:23 -0400 Subject: sunrpc: add a Kconfig option to redirect dfprintk() output to trace buffer We have a lot of old dprintk() call sites that aren't going anywhere anytime soon. At the same time, turning them up is a serious burden on the host due to the console locking overhead. Add a new Kconfig option that redirects dfprintk() output to the trace buffer. This is more efficient than logging to the console and allows for proper interleaving of dprintk and static tracepoint events. Since using trace_printk() causes scary warnings to pop at boot time, this new option defaults to "n". 
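For illustration, a call site like the hedged sketch below is all that changes behavior (the message is invented; dfprintk() and the CALL facility are real):

  static void demo_rpc_debug(u32 xid)
  {
  	/* With CONFIG_SUNRPC_DEBUG_TRACE=y this expands to trace_printk()
  	 * and the message lands in the trace buffer; otherwise it remains
  	 * a plain printk(KERN_DEFAULT ...) to the console. */
  	dfprintk(CALL, "RPC: processing xid 0x%08x\n", xid);
  }

Either way, output is still gated by the rpc_debug facility bits, so nothing is emitted unless the matching debug flag is enabled.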
Signed-off-by: Jeff Layton Reviewed-by: Chuck Lever Reviewed-by: Simon Horman Signed-off-by: Anna Schumaker --- include/linux/sunrpc/debug.h | 10 ++++++++-- net/sunrpc/Kconfig | 14 ++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 99a6fa4a1d6a..891f6173c951 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -30,17 +30,23 @@ extern unsigned int nlm_debug; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define ifdebug(fac) if (unlikely(rpc_debug & RPCDBG_##fac)) +# if IS_ENABLED(CONFIG_SUNRPC_DEBUG_TRACE) +# define __sunrpc_printk(fmt, ...) trace_printk(fmt, ##__VA_ARGS__) +# else +# define __sunrpc_printk(fmt, ...) printk(KERN_DEFAULT fmt, ##__VA_ARGS__) +# endif + # define dfprintk(fac, fmt, ...) \ do { \ ifdebug(fac) \ - printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ + __sunrpc_printk(fmt, ##__VA_ARGS__); \ } while (0) # define dfprintk_rcu(fac, fmt, ...) \ do { \ ifdebug(fac) { \ rcu_read_lock(); \ - printk(KERN_DEFAULT fmt, ##__VA_ARGS__); \ + __sunrpc_printk(fmt, ##__VA_ARGS__); \ rcu_read_unlock(); \ } \ } while (0) diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 2d8b67dac7b5..a570e7adf270 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -101,6 +101,20 @@ config SUNRPC_DEBUG If unsure, say Y. +config SUNRPC_DEBUG_TRACE + bool "RPC: Send dfprintk() output to the trace buffer" + depends on SUNRPC_DEBUG && TRACING + default n + help + dprintk() output can be voluminous, which can overwhelm the + kernel's logging facility as it must be sent to the console. + This option causes dprintk() output to go to the trace buffer + instead of the kernel log. + + This will cause warnings about trace_printk() being used to be + logged at boot time, so say N unless you are debugging a problem + with sunrpc-based clients or services. + config SUNRPC_XPRT_RDMA tristate "RPC-over-RDMA transport" depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS -- cgit v1.2.3 From 62c0c0e7491211969d8d1c2a9ab0e112b34664cf Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 Aug 2025 10:27:28 -0400 Subject: SUNRPC: Move the svc_rpcb_cleanup() call sites Clean up: because svc_rpcb_cleanup() and svc_xprt_destroy_all() are always invoked in pairs, we can deduplicate code by moving the svc_rpcb_cleanup() call sites into svc_xprt_destroy_all(). Signed-off-by: Chuck Lever Tested-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/lockd/svc.c | 6 ++---- fs/nfs/callback.c | 2 +- fs/nfsd/nfsctl.c | 2 +- fs/nfsd/nfssvc.c | 7 ++----- include/linux/sunrpc/svc_xprt.h | 3 ++- net/sunrpc/svc.c | 1 - net/sunrpc/svc_xprt.c | 7 ++++++- 7 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index e80262a51884..d68afa196535 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -216,8 +216,7 @@ out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); return err; } @@ -255,8 +254,7 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) nlm_shutdown_hosts_net(net); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); } } else { pr_err("%s: no users! 
net=%x\n", diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 511f80878809..c8b837006bb2 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -136,7 +136,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc return; dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum); - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); } static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index bc6b776fc657..63d52edcad72 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1993,7 +1993,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) * remaining listeners and recreate the list. */ if (delete) - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); /* walk list of addrs again, open any that still don't exist */ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 82b0111ac469..7057ddd7a0a8 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -535,16 +535,13 @@ void nfsd_destroy_serv(struct net *net) #endif } - svc_xprt_destroy_all(serv, net); - /* * write_ports can create the server without actually starting - * any threads--if we get shut down before any threads are + * any threads. If we get shut down before any threads are * started, then nfsd_destroy_serv will be run before any of this * other initialization has been done except the rpcb information. */ - svc_rpcb_cleanup(serv, net); - + svc_xprt_destroy_all(serv, net, true); nfsd_shutdown_net(net); svc_destroy(&serv); } diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 369a89aea186..fde60d4e2cd5 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -165,7 +165,8 @@ int svc_xprt_create(struct svc_serv *serv, const char *xprt_name, struct net *net, const int family, const unsigned short port, int flags, const struct cred *cred); -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net); +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister); void svc_xprt_received(struct svc_xprt *xprt); void svc_xprt_enqueue(struct svc_xprt *xprt); void svc_xprt_put(struct svc_xprt *xprt); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index b1fab3a69544..9c7245d811eb 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -436,7 +436,6 @@ void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net) svc_unregister(serv, net); rpcb_put_local(net); } -EXPORT_SYMBOL_GPL(svc_rpcb_cleanup); static int svc_uses_rpcbind(struct svc_serv *serv) { diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 8b1837228799..049ab53088e9 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1102,6 +1102,7 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * svc_xprt_destroy_all - Destroy transports associated with @serv * @serv: RPC service to be shut down * @net: target network namespace + * @unregister: true if it is OK to unregister the destroyed xprts * * Server threads may still be running (especially in the case where the * service is still running in other network namespaces). @@ -1114,7 +1115,8 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * threads, we may need to wait a little while and then check again to * see if they're done. 
*/ -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister) { int delay = 0; @@ -1124,6 +1126,9 @@ void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) svc_clean_up_xprts(serv, net); msleep(delay++); } + + if (unregister) + svc_rpcb_cleanup(serv, net); } EXPORT_SYMBOL_GPL(svc_xprt_destroy_all); -- cgit v1.2.3 From 301f3470273c89df3a933762b7495569f650e68b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 21 Aug 2025 12:27:00 -0400 Subject: nfs: remove NFS_WBACK_BUSY() Nothing calls this macro. Signed-off-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs_page.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 9aed39abc94b..afe1d8f09d89 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -122,8 +122,6 @@ struct nfs_pageio_descriptor { /* arbitrarily selected limit to number of mirrors */ #define NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX 16 -#define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) - extern struct nfs_page *nfs_page_create_from_page(struct nfs_open_context *ctx, struct page *page, unsigned int pgbase, -- cgit v1.2.3 From c8a127596edc026e5364a7a609986dcd3999914c Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 30 Jun 2025 10:04:54 -0400 Subject: SUNRPC: Introduce xdr_set_scratch_folio() This will replace xdr_set_scratch_page() when we switch pages to folios. Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xdr.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 8a9ec617cf66..3ce17321689a 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -300,6 +300,19 @@ xdr_set_scratch_page(struct xdr_stream *xdr, struct page *page) xdr_set_scratch_buffer(xdr, page_address(page), PAGE_SIZE); } +/** + * xdr_set_scratch_folio - Attach a scratch buffer for decoding data + * @xdr: pointer to xdr_stream struct + * @folio: an anonymous folio + * + * See xdr_set_scratch_buffer(). 
+ */ +static inline void +xdr_set_scratch_folio(struct xdr_stream *xdr, struct folio *folio) +{ + xdr_set_scratch_buffer(xdr, folio_address(folio), folio_size(folio)); +} + /** * xdr_reset_scratch_buffer - Clear scratch buffer information * @xdr: pointer to xdr_stream struct -- cgit v1.2.3 From 2f8416f23edf3b5c49ce8e08616d0071cda5c37b Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 30 Jun 2025 10:32:29 -0400 Subject: NFS: Update getacl to use xdr_set_scratch_folio() Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 4 ++-- fs/nfs/nfs4xdr.c | 2 +- include/linux/nfs_xdr.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4de3e4bd724b..a5085820ec0a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6160,7 +6160,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, } /* for decoding across pages */ - res.acl_scratch = alloc_page(GFP_KERNEL); + res.acl_scratch = folio_alloc(GFP_KERNEL, 0); if (!res.acl_scratch) goto out_free; @@ -6196,7 +6196,7 @@ out_free: while (--i >= 0) __free_page(pages[i]); if (res.acl_scratch) - __free_page(res.acl_scratch); + folio_put(res.acl_scratch); kfree(pages); return ret; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 4ee88a4c1daa..1d0e6c10f921 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -6585,7 +6585,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, int status; if (res->acl_scratch != NULL) - xdr_set_scratch_page(xdr, res->acl_scratch); + xdr_set_scratch_folio(xdr, res->acl_scratch); status = decode_compound_hdr(xdr, &hdr); if (status) goto out; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index ac4bff6e9913..7823d4574e29 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -862,7 +862,7 @@ struct nfs_getaclres { size_t acl_len; size_t acl_data_offset; int acl_flags; - struct page * acl_scratch; + struct folio * acl_scratch; }; struct nfs_setattrres { -- cgit v1.2.3 From c9cefd7ae86aa3463adfedca309696ba0946f9c5 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 30 Jun 2025 11:14:41 -0400 Subject: NFS: Update listxattr to use xdr_set_scratch_folio() Signed-off-by: Anna Schumaker --- fs/nfs/nfs42proc.c | 4 ++-- fs/nfs/nfs42xdr.c | 2 +- include/linux/nfs_xdr.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6a0b5871ba3b..d537fb0c230e 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1514,7 +1514,7 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void *buf, ret = -ENOMEM; - res.scratch = alloc_page(GFP_KERNEL); + res.scratch = folio_alloc(GFP_KERNEL, 0); if (!res.scratch) goto out; @@ -1552,7 +1552,7 @@ out_free_pages: } kfree(pages); out_free_scratch: - __free_page(res.scratch); + folio_put(res.scratch); out: return ret; diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 4cc915d5741d..e10d83ba835e 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1781,7 +1781,7 @@ static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp, struct compound_hdr hdr; int status; - xdr_set_scratch_page(xdr, res->scratch); + xdr_set_scratch_folio(xdr, res->scratch); status = decode_compound_hdr(xdr, &hdr); if (status) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 7823d4574e29..d56583572c98 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1596,7 +1596,7 @@ struct nfs42_listxattrsargs { struct 
nfs42_listxattrsres { struct nfs4_sequence_res seq_res; - struct page *scratch; + struct folio *scratch; void *xattr_buf; size_t xattr_len; u64 cookie; -- cgit v1.2.3 From d57e43b72bf2071caac90da323849c3983a695f0 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 30 Jun 2025 14:53:09 -0400 Subject: SUNRPC: Update svcxdr_init_decode() to call xdr_set_scratch_folio() The only snag here is that __folio_alloc_node() doesn't handle NUMA_NO_NODE, so I also need to update svc_pool_map_get_node() to return numa_mem_id() instead. I arrived at this approach by looking at what other users of __folio_alloc_node() do for this case. Signed-off-by: Anna Schumaker --- include/linux/sunrpc/svc.h | 4 ++-- net/sunrpc/svc.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 40cbe81360ed..5506d20857c3 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -196,7 +196,7 @@ struct svc_rqst { struct xdr_buf rq_arg; struct xdr_stream rq_arg_stream; struct xdr_stream rq_res_stream; - struct page *rq_scratch_page; + struct folio *rq_scratch_folio; struct xdr_buf rq_res; unsigned long rq_maxpages; /* num of entries in rq_pages */ struct page * *rq_pages; @@ -503,7 +503,7 @@ static inline void svcxdr_init_decode(struct svc_rqst *rqstp) buf->len = buf->head->iov_len + buf->page_len + buf->tail->iov_len; xdr_init_decode(xdr, buf, argv->iov_base, NULL); - xdr_set_scratch_page(xdr, rqstp->rq_scratch_page); + xdr_set_scratch_folio(xdr, rqstp->rq_scratch_folio); } /** diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 9c7245d811eb..de05ef637bdc 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -352,7 +352,7 @@ static int svc_pool_map_get_node(unsigned int pidx) if (m->mode == SVC_POOL_PERNODE) return m->pool_to[pidx]; } - return NUMA_NO_NODE; + return numa_mem_id(); } /* * Set the given thread's cpus_allowed mask so that it @@ -669,8 +669,8 @@ svc_rqst_free(struct svc_rqst *rqstp) folio_batch_release(&rqstp->rq_fbatch); kfree(rqstp->rq_bvec); svc_release_buffer(rqstp); - if (rqstp->rq_scratch_page) - put_page(rqstp->rq_scratch_page); + if (rqstp->rq_scratch_folio) + folio_put(rqstp->rq_scratch_folio); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); @@ -691,8 +691,8 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) rqstp->rq_server = serv; rqstp->rq_pool = pool; - rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0); - if (!rqstp->rq_scratch_page) + rqstp->rq_scratch_folio = __folio_alloc_node(GFP_KERNEL, 0, node); + if (!rqstp->rq_scratch_folio) goto out_enomem; rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); -- cgit v1.2.3 From cc6ac66f1c0946299b8f192026cff9a320aaad18 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 1 Jul 2025 10:46:50 -0400 Subject: SUNRPC: Update gssx_accept_sec_context() to use xdr_set_scratch_folio() This was the last caller of xdr_set_scratch_page(), so I remove this function while I'm at it. 
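For reference, every conversion in this series follows the same allocate/attach/release pattern; a sketch, not tied to any one call site:

    struct folio *scratch;

    scratch = folio_alloc(GFP_KERNEL, 0);	/* order 0: a single page */
    if (!scratch)
    	return -ENOMEM;
    xdr_set_scratch_folio(xdr, scratch);	/* decoding may now span page boundaries */
    /* ... xdr_inline_decode() and friends ... */
    folio_put(scratch);				/* replaces __free_page() */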
Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xdr.h | 13 ------------- net/sunrpc/auth_gss/gss_rpc_xdr.c | 8 ++++---- 2 files changed, 4 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 3ce17321689a..49278749ad0c 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -287,19 +287,6 @@ xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen) xdr->scratch.iov_len = buflen; } -/** - * xdr_set_scratch_page - Attach a scratch buffer for decoding data - * @xdr: pointer to xdr_stream struct - * @page: an anonymous page - * - * See xdr_set_scratch_buffer(). - */ -static inline void -xdr_set_scratch_page(struct xdr_stream *xdr, struct page *page) -{ - xdr_set_scratch_buffer(xdr, page_address(page), PAGE_SIZE); -} - /** * xdr_set_scratch_folio - Attach a scratch buffer for decoding data * @xdr: pointer to xdr_stream struct diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index cb32ab9a8395..7d2cdc2bd374 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -794,12 +794,12 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, struct gssx_res_accept_sec_context *res = data; u32 value_follows; int err; - struct page *scratch; + struct folio *scratch; - scratch = alloc_page(GFP_KERNEL); + scratch = folio_alloc(GFP_KERNEL, 0); if (!scratch) return -ENOMEM; - xdr_set_scratch_page(xdr, scratch); + xdr_set_scratch_folio(xdr, scratch); /* res->status */ err = gssx_dec_status(xdr, &res->status); @@ -844,6 +844,6 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, err = gssx_dec_option_array(xdr, &res->options); out_free: - __free_page(scratch); + folio_put(scratch); return err; } -- cgit v1.2.3 From 24bbd533f596a4544e17579e9f622918680e7bff Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 6 Sep 2025 12:48:14 -0400 Subject: filemap: Add a helper for filesystems implementing dropbehind Add a helper to allow filesystems to attempt to free the 'dropbehind' folio. Signed-off-by: Trond Myklebust Link: https://lore.kernel.org/all/5588a06f6d5a2cf6746828e2d36e7ada668b1739.1745381692.git.trond.myklebust@hammerspace.com/ Reviewed-by: Mike Snitzer Signed-off-by: Anna Schumaker --- include/linux/pagemap.h | 1 + mm/filemap.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 12a12dae727d..201b7c6f6441 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1221,6 +1221,7 @@ void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); void end_page_writeback(struct page *page); void folio_end_writeback(struct folio *folio); +void folio_end_dropbehind(struct folio *folio); void folio_wait_stable(struct folio *folio); void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn); void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb); diff --git a/mm/filemap.c b/mm/filemap.c index 751838ef05e5..66cec689bec4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1608,7 +1608,7 @@ static void filemap_end_dropbehind(struct folio *folio) * completes. Do that now. If we fail, it's likely because of a big folio - * just reset dropbehind for that case and latter completions should invalidate. 
*/ -static void filemap_end_dropbehind_write(struct folio *folio) +void folio_end_dropbehind(struct folio *folio) { if (!folio_test_dropbehind(folio)) return; @@ -1625,6 +1625,7 @@ static void filemap_end_dropbehind_write(struct folio *folio) folio_unlock(folio); } } +EXPORT_SYMBOL_GPL(folio_end_dropbehind); /** * folio_end_writeback - End writeback against a folio. @@ -1660,7 +1661,7 @@ void folio_end_writeback(struct folio *folio) if (__folio_end_writeback(folio)) folio_wake_bit(folio, PG_writeback); - filemap_end_dropbehind_write(folio); + folio_end_dropbehind(folio); acct_reclaim_writeback(folio); folio_put(folio); } -- cgit v1.2.3 From 010054a530aa266ee1711dfbe23fc06b6eb0fa48 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 6 Sep 2025 12:48:15 -0400 Subject: filemap: Add a version of folio_end_writeback that ignores dropbehind Filesystems such as NFS may need to defer dropbehind until after their 2-stage writes are done. This adds a helper folio_end_writeback_no_dropbehind() that allows them to release the writeback flag without immediately dropping the folio. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/pagemap.h | 1 + mm/filemap.c | 29 +++++++++++++++++++++++------ 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 201b7c6f6441..5b26465358ce 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1221,6 +1221,7 @@ void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); void end_page_writeback(struct page *page); void folio_end_writeback(struct folio *folio); +void folio_end_writeback_no_dropbehind(struct folio *folio); void folio_end_dropbehind(struct folio *folio); void folio_wait_stable(struct folio *folio); void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn); diff --git a/mm/filemap.c b/mm/filemap.c index 66cec689bec4..d12bbb4c9d8a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1628,14 +1628,15 @@ void folio_end_dropbehind(struct folio *folio) EXPORT_SYMBOL_GPL(folio_end_dropbehind); /** - * folio_end_writeback - End writeback against a folio. + * folio_end_writeback_no_dropbehind - End writeback against a folio. * @folio: The folio. * * The folio must actually be under writeback. + * This call is intended for filesystems that need to defer dropbehind. * * Context: May be called from process or interrupt context. */ -void folio_end_writeback(struct folio *folio) +void folio_end_writeback_no_dropbehind(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); @@ -1651,6 +1652,25 @@ void folio_end_writeback(struct folio *folio) folio_rotate_reclaimable(folio); } + if (__folio_end_writeback(folio)) + folio_wake_bit(folio, PG_writeback); + + acct_reclaim_writeback(folio); +} +EXPORT_SYMBOL_GPL(folio_end_writeback_no_dropbehind); + +/** + * folio_end_writeback - End writeback against a folio. + * @folio: The folio. + * + * The folio must actually be under writeback. + * + * Context: May be called from process or interrupt context. + */ +void folio_end_writeback(struct folio *folio) +{ + VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); + /* * Writeback does not hold a folio reference of its own, relying * on truncation to wait for the clearing of PG_writeback. @@ -1658,11 +1678,8 @@ void folio_end_writeback(struct folio *folio) * reused before the folio_wake_bit(). 
*/ folio_get(folio); - if (__folio_end_writeback(folio)) - folio_wake_bit(folio, PG_writeback); - + folio_end_writeback_no_dropbehind(folio); folio_end_dropbehind(folio); - acct_reclaim_writeback(folio); folio_put(folio); } EXPORT_SYMBOL(folio_end_writeback); -- cgit v1.2.3 From a91ae3c89311648cbaa9b46b860e4f76004a24b8 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Sep 2025 11:01:49 +0000 Subject: bpf, x86: Add support for signed arena loads Currently, signed load instructions into arena memory are unsupported. The compiler is free to generate these, and on GCC-14 we see a corresponding error when it happens. The hurdle in supporting them is deciding which unused opcode to use to mark them for the JIT's own consumption. After much thinking, it appears 0xc0 / BPF_NOSPEC can be combined with load instructions to identify signed arena loads. Use this to recognize and JIT them appropriately, and remove the verifier side limitation on the program if the JIT supports them. Co-developed-by: Puranjay Mohan Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Puranjay Mohan Link: https://lore.kernel.org/r/20250923110157.18326-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 5 +++++ arch/riscv/net/bpf_jit_comp64.c | 5 +++++ arch/s390/net/bpf_jit_comp.c | 5 +++++ arch/x86/net/bpf_jit_comp.c | 40 +++++++++++++++++++++++++++++++++++++--- include/linux/filter.h | 3 +++ kernel/bpf/verifier.c | 11 ++++++++--- 6 files changed, 63 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index e36261c63952..796938b535cd 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -3064,6 +3064,11 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) if (!bpf_atomic_is_load_store(insn) && !cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) return false; + break; + case BPF_LDX | BPF_MEMSX | BPF_B: + case BPF_LDX | BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_W: + return false; } return true; } diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 14d7aab61fcb..83672373d026 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -2066,6 +2066,11 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) case BPF_STX | BPF_ATOMIC | BPF_DW: if (insn->imm == BPF_CMPXCHG) return rv_ext_enabled(ZACAS); + break; + case BPF_LDX | BPF_MEMSX | BPF_B: + case BPF_LDX | BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_W: + return false; } } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 8b57d8532f36..cf461d76e9da 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -2967,6 +2967,11 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) case BPF_STX | BPF_ATOMIC | BPF_DW: if (bpf_atomic_is_load_store(insn)) return false; + break; + case BPF_LDX | BPF_MEMSX | BPF_B: + case BPF_LDX | BPF_MEMSX | BPF_H: + case BPF_LDX | BPF_MEMSX | BPF_W: + return false; } return true; } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 8d34a9400a5e..fc13306af15f 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1152,11 +1152,38 @@ static void emit_ldx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 i *pprog = prog; } +static void emit_ldsx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off) +{ + u8 *prog = *pprog; + + switch (size) { + case 
BPF_B: + /* movsx rax, byte ptr [rax + r12 + off] */ + EMIT3(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x0F, 0xBE); + break; + case BPF_H: + /* movsx rax, word ptr [rax + r12 + off] */ + EMIT3(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x0F, 0xBF); + break; + case BPF_W: + /* movsx rax, dword ptr [rax + r12 + off] */ + EMIT2(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x63); + break; + } + emit_insn_suffix_SIB(&prog, src_reg, dst_reg, index_reg, off); + *pprog = prog; +} + static void emit_ldx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { emit_ldx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off); } +static void emit_ldsx_r12(u8 **prog, u32 size, u32 dst_reg, u32 src_reg, int off) +{ + emit_ldsx_index(prog, size, dst_reg, src_reg, X86_REG_R12, off); +} + /* STX: *(u8*)(dst_reg + off) = src_reg */ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) { @@ -2109,15 +2136,22 @@ st: if (is_imm8(insn->off)) case BPF_LDX | BPF_PROBE_MEM32 | BPF_H: case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_B: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_H: + case BPF_LDX | BPF_PROBE_MEM32SX | BPF_W: case BPF_STX | BPF_PROBE_MEM32 | BPF_B: case BPF_STX | BPF_PROBE_MEM32 | BPF_H: case BPF_STX | BPF_PROBE_MEM32 | BPF_W: case BPF_STX | BPF_PROBE_MEM32 | BPF_DW: start_of_ldx = prog; - if (BPF_CLASS(insn->code) == BPF_LDX) - emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); - else + if (BPF_CLASS(insn->code) == BPF_LDX) { + if (BPF_MODE(insn->code) == BPF_PROBE_MEM32SX) + emit_ldsx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + else + emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + } else { emit_stx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + } populate_extable: { struct exception_table_entry *ex; diff --git a/include/linux/filter.h b/include/linux/filter.h index 4241a885975f..f5c859b8131a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -78,6 +78,9 @@ struct ctl_table_header; /* unused opcode to mark special atomic instruction */ #define BPF_PROBE_ATOMIC 0xe0 +/* unused opcode to mark special ldsx instruction. 
Same as BPF_NOSPEC */ +#define BPF_PROBE_MEM32SX 0xc0 + /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ceeb0ffe7d67..164199237176 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21379,10 +21379,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) continue; case PTR_TO_ARENA: if (BPF_MODE(insn->code) == BPF_MEMSX) { - verbose(env, "sign extending loads from arena are not supported yet\n"); - return -EOPNOTSUPP; + if (!bpf_jit_supports_insn(insn, true)) { + verbose(env, "sign extending loads from arena are not supported yet\n"); + return -EOPNOTSUPP; + } + insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32SX | BPF_SIZE(insn->code); + } else { + insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code); } - insn->code = BPF_CLASS(insn->code) | BPF_PROBE_MEM32 | BPF_SIZE(insn->code); env->prog->aux->num_exentries++; continue; default: @@ -21588,6 +21592,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) == BPF_PROBE_MEM || BPF_MODE(insn->code) == BPF_PROBE_MEM32 || + BPF_MODE(insn->code) == BPF_PROBE_MEM32SX || BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) num_exentries++; if ((BPF_CLASS(insn->code) == BPF_STX || -- cgit v1.2.3 From edf005fa274a0c224e550a52726aa7a426384e36 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 23 Sep 2025 09:03:26 -1000 Subject: sched_ext: Improve SCX_KF_DISPATCH comment The comment for SCX_KF_DISPATCH was incomplete and didn't explain that ops.dispatch() may temporarily release the rq lock, allowing ENQUEUE and SELECT_CPU operations to be nested inside DISPATCH contexts. Update the comment to clarify this nesting behavior and provide better context for when these operations can occur within dispatch. Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 7047101dbf58..d82b7a9b0658 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -108,7 +108,11 @@ enum scx_kf_mask { SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */ /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */ - /* ops.dequeue (in REST) may be nested inside DISPATCH */ + /* + * ops.dispatch() may release rq lock temporarily and thus ENQUEUE and + * SELECT_CPU may be nested inside. ops.dequeue (in REST) may also be + * nested inside DISPATCH. + */ SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */ SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */ SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */ -- cgit v1.2.3 From ccb4f5d91ec43c05ba165ccfc7ed889eb9cdfd05 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Fri, 19 Sep 2025 12:41:09 +0800 Subject: bpf: Allow union argument in trampoline based programs Currently, functions with 'union' arguments cannot be traced with fentry/fexit: bpftrace -e 'fentry:release_pages { exit(); }' -v The function release_pages arg0 type UNION is unsupported. The type of the 'release_pages' arg0 is defined as: typedef union { struct page **pages; struct folio **folios; struct encoded_page **encoded_pages; } release_pages_arg __attribute__ ((__transparent_union__)); This patch relaxes the restriction by allowing function arguments of type 'union' to be traced in verifier. 
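With the restriction lifted, such functions can be attached to directly. A hedged sketch of a libbpf-style fentry program, declaring the union argument as one of its members for simplicity (assumes release_pages_arg members are visible through vmlinux.h):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    char LICENSE[] SEC("license") = "GPL";

    SEC("fentry/release_pages")
    int BPF_PROG(trace_release_pages, struct encoded_page **pages, int nr)
    {
    	/* first argument is the transparent union shown above */
    	bpf_printk("release_pages: nr=%d", nr);
    	return 0;
    }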
Reviewed-by: Amery Hung Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20250919044110.23729-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/btf.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a2ab51fa8b0a..ba3a3be7eb2a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1128,7 +1128,7 @@ struct bpf_prog_offload { */ #define MAX_BPF_FUNC_REG_ARGS 5 -/* The argument is a structure. */ +/* The argument is a structure or a union. */ #define BTF_FMODEL_STRUCT_ARG BIT(0) /* The argument is signed. */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 9f47a3aa7ff8..0de8fc8a0e0b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6751,7 +6751,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* skip modifiers */ while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); - if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) + if (btf_type_is_small_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t)) /* accessing a scalar */ return true; if (!btf_type_is_ptr(t)) { @@ -7323,7 +7323,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id, if (btf_type_is_ptr(t)) /* kernel size of pointer. Not BPF's size of pointer*/ return sizeof(void *); - if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t)) + if (btf_type_is_int(t) || btf_is_any_enum(t) || btf_type_is_struct(t)) return t->size; return -EINVAL; } @@ -7332,7 +7332,7 @@ static u8 __get_type_fmodel_flags(const struct btf_type *t) { u8 flags = 0; - if (__btf_type_is_struct(t)) + if (btf_type_is_struct(t)) flags |= BTF_FMODEL_STRUCT_ARG; if (btf_type_is_signed_int(t)) flags |= BTF_FMODEL_SIGNED_ARG; @@ -7373,7 +7373,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, return -EINVAL; } ret = __get_type_size(btf, func->type, &t); - if (ret < 0 || __btf_type_is_struct(t)) { + if (ret < 0 || btf_type_is_struct(t)) { bpf_log(log, "The function %s return type %s is unsupported.\n", tname, btf_type_str(t)); -- cgit v1.2.3 From b650bf0977d34c52befb31a9fa711534e11b220f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Sep 2025 10:42:40 +0000 Subject: udp: remove busylock and add per NUMA queues busylock was protecting UDP sockets against packet floods, but unfortunately was not protecting the host itself. Under stress, many cpus could spin while acquiring the busylock, and NIC had to drop packets. Or packets would be dropped in cpu backlog if RPS/RFS were in place. This patch replaces the busylock by intermediate lockless queues. (One queue per NUMA node). This means that fewer number of cpus have to acquire the UDP receive queue lock. Most of the cpus can either: - immediately drop the packet. - or queue it in their NUMA aware lockless queue. Then one of the cpu is chosen to process this lockless queue in a batch. The batch only contains packets that were cooked on the same NUMA node, thus with very limited latency impact. 
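In sketch form, the producer/consumer handoff works like this (simplified from the real __udp_enqueue_schedule_skb() in the diff below):

    struct udp_prod_queue *q = &udp_sk(sk)->udp_prod_queue[numa_node_id()];
    struct llist_node *ll_list;

    atomic_add(skb->truesize, &q->rmem_alloc);
    if (!llist_add(&skb->ll_node, &q->ll_root))
    	return 0;	/* queue was non-empty: the flushing cpu picks this skb up */

    /*
     * This cpu observed the empty->non-empty transition, so it becomes
     * the consumer: take the receive queue lock once, splice the whole
     * batch, then wake any readers.
     */
    spin_lock(&sk->sk_receive_queue.lock);
    ll_list = llist_reverse_order(llist_del_all(&q->ll_root));
    /* ... charge rmem and __skb_queue_tail() each skb ... */
    spin_unlock(&sk->sk_receive_queue.lock);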
Tested: DDoS targeting a victim UDP socket, on a platform with 6 NUMA nodes (Intel(R) Xeon(R) 6985P-C) Before: nstat -n ; sleep 1 ; nstat | grep Udp Udp6InDatagrams 1004179 0.0 Udp6InErrors 3117 0.0 Udp6RcvbufErrors 3117 0.0 After: nstat -n ; sleep 1 ; nstat | grep Udp Udp6InDatagrams 1116633 0.0 Udp6InErrors 14197275 0.0 Udp6RcvbufErrors 14197275 0.0 We can see this host can now process 14.2 M more packets per second while under attack, and the victim socket can receive 11% more packets. I used a small bpftrace program measuring time (in us) spent in __udp_enqueue_schedule_skb(). Before: @udp_enqueue_us[398]: [0] 24901 |@@@ | [1] 63512 |@@@@@@@@@ | [2, 4) 344827 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [4, 8) 244673 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [8, 16) 54022 |@@@@@@@@ | [16, 32) 222134 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [32, 64) 232042 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [64, 128) 4219 | | [128, 256) 188 | | After: @udp_enqueue_us[398]: [0] 5608855 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [1] 1111277 |@@@@@@@@@@ | [2, 4) 501439 |@@@@ | [4, 8) 102921 | | [8, 16) 29895 | | [16, 32) 43500 | | [32, 64) 31552 | | [64, 128) 979 | | Note that the remaining bottleneck for this platform is in udp_drops_inc() because we limited struct numa_drop_counters to only two nodes so far. Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Reviewed-by: Willem de Bruijn Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250922104240.2182559-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/udp.h | 9 +++- include/net/udp.h | 11 ++++- net/ipv4/udp.c | 117 +++++++++++++++++++++++++++++++--------------------- net/ipv6/udp.c | 5 ++- 4 files changed, 91 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index e554890c4415..58795688a186 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -44,6 +44,12 @@ enum { UDP_FLAGS_UDPLITE_RECV_CC, /* set via udplite setsockopt */ }; +/* per NUMA structure for lockless producer usage. 
*/ +struct udp_prod_queue { + struct llist_head ll_root ____cacheline_aligned_in_smp; + atomic_t rmem_alloc; +}; + struct udp_sock { /* inet_sock has to be the first member */ struct inet_sock inet; @@ -90,6 +96,8 @@ struct udp_sock { struct sk_buff *skb, int nhoff); + struct udp_prod_queue *udp_prod_queue; + /* udp_recvmsg try to use this before splicing sk_receive_queue */ struct sk_buff_head reader_queue ____cacheline_aligned_in_smp; @@ -109,7 +117,6 @@ struct udp_sock { */ struct hlist_node tunnel_list; struct numa_drop_counters drop_counters; - spinlock_t busylock ____cacheline_aligned_in_smp; }; #define udp_test_bit(nr, sk) \ diff --git a/include/net/udp.h b/include/net/udp.h index 059a0cee5f55..cffedb3e40f2 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -284,16 +284,23 @@ INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features, bool is_ipv6); -static inline void udp_lib_init_sock(struct sock *sk) +static inline int udp_lib_init_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); sk->sk_drop_counters = &up->drop_counters; - spin_lock_init(&up->busylock); skb_queue_head_init(&up->reader_queue); INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); + + up->udp_prod_queue = kcalloc(nr_node_ids, sizeof(*up->udp_prod_queue), + GFP_KERNEL); + if (!up->udp_prod_queue) + return -ENOMEM; + for (int i = 0; i < nr_node_ids; i++) + init_llist_head(&up->udp_prod_queue[i].ll_root); + return 0; } static inline void udp_drops_inc(struct sock *sk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 85cfc32eb2cc..95241093b7f0 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1685,25 +1685,6 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) udp_rmem_release(sk, udp_skb_truesize(skb), 1, true); } -/* Idea of busylocks is to let producers grab an extra spinlock - * to relieve pressure on the receive_queue spinlock shared by consumer. - * Under flood, this means that only one producer can be in line - * trying to acquire the receive_queue spinlock. - */ -static spinlock_t *busylock_acquire(struct sock *sk) -{ - spinlock_t *busy = &udp_sk(sk)->busylock; - - spin_lock(busy); - return busy; -} - -static void busylock_release(spinlock_t *busy) -{ - if (busy) - spin_unlock(busy); -} - static int udp_rmem_schedule(struct sock *sk, int size) { int delta; @@ -1718,14 +1699,24 @@ static int udp_rmem_schedule(struct sock *sk, int size) int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; + struct udp_prod_queue *udp_prod_queue; + struct sk_buff *next, *to_drop = NULL; + struct llist_node *ll_list; unsigned int rmem, rcvbuf; - spinlock_t *busy = NULL; int size, err = -ENOMEM; + int total_size = 0; + int q_size = 0; + int dropcount; + int nb = 0; rmem = atomic_read(&sk->sk_rmem_alloc); rcvbuf = READ_ONCE(sk->sk_rcvbuf); size = skb->truesize; + udp_prod_queue = &udp_sk(sk)->udp_prod_queue[numa_node_id()]; + + rmem += atomic_read(&udp_prod_queue->rmem_alloc); + /* Immediately drop when the receive queue is full. * Cast to unsigned int performs the boundary check for INT_MAX. 
*/ @@ -1747,45 +1738,77 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) if (rmem > (rcvbuf >> 1)) { skb_condense(skb); size = skb->truesize; - rmem = atomic_add_return(size, &sk->sk_rmem_alloc); - if (rmem > rcvbuf) - goto uncharge_drop; - busy = busylock_acquire(sk); - } else { - atomic_add(size, &sk->sk_rmem_alloc); } udp_set_dev_scratch(skb); + atomic_add(size, &udp_prod_queue->rmem_alloc); + + if (!llist_add(&skb->ll_node, &udp_prod_queue->ll_root)) + return 0; + + dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? sk_drops_read(sk) : 0; + spin_lock(&list->lock); - err = udp_rmem_schedule(sk, size); - if (err) { - spin_unlock(&list->lock); - goto uncharge_drop; - } - sk_forward_alloc_add(sk, -size); + ll_list = llist_del_all(&udp_prod_queue->ll_root); - /* no need to setup a destructor, we will explicitly release the - * forward allocated memory on dequeue - */ - sock_skb_set_dropcount(sk, skb); + ll_list = llist_reverse_order(ll_list); + + llist_for_each_entry_safe(skb, next, ll_list, ll_node) { + size = udp_skb_truesize(skb); + total_size += size; + err = udp_rmem_schedule(sk, size); + if (unlikely(err)) { + /* Free the skbs outside of locked section. */ + skb->next = to_drop; + to_drop = skb; + continue; + } + + q_size += size; + sk_forward_alloc_add(sk, -size); + + /* no need to setup a destructor, we will explicitly release the + * forward allocated memory on dequeue + */ + SOCK_SKB_CB(skb)->dropcount = dropcount; + nb++; + __skb_queue_tail(list, skb); + } + + atomic_add(q_size, &sk->sk_rmem_alloc); - __skb_queue_tail(list, skb); spin_unlock(&list->lock); - if (!sock_flag(sk, SOCK_DEAD)) - INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); + if (!sock_flag(sk, SOCK_DEAD)) { + /* Multiple threads might be blocked in recvmsg(), + * using prepare_to_wait_exclusive(). + */ + while (nb) { + INDIRECT_CALL_1(sk->sk_data_ready, + sock_def_readable, sk); + nb--; + } + } + + if (unlikely(to_drop)) { + for (nb = 0; to_drop != NULL; nb++) { + skb = to_drop; + to_drop = skb->next; + skb_mark_not_on_list(skb); + /* TODO: update SNMP values. 
*/ + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM); + } + numa_drop_add(&udp_sk(sk)->drop_counters, nb); + } - busylock_release(busy); - return 0; + atomic_sub(total_size, &udp_prod_queue->rmem_alloc); -uncharge_drop: - atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + return 0; drop: udp_drops_inc(sk); - busylock_release(busy); return err; } EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb); @@ -1803,6 +1826,7 @@ void udp_destruct_common(struct sock *sk) kfree_skb(skb); } udp_rmem_release(sk, total, 0, true); + kfree(up->udp_prod_queue); } EXPORT_IPV6_MOD_GPL(udp_destruct_common); @@ -1814,10 +1838,11 @@ static void udp_destruct_sock(struct sock *sk) int udp_init_sock(struct sock *sk) { - udp_lib_init_sock(sk); + int res = udp_lib_init_sock(sk); + sk->sk_destruct = udp_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); - return 0; + return res; } void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 9f4d340d1e3a..813a2ba75824 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -67,10 +67,11 @@ static void udpv6_destruct_sock(struct sock *sk) int udpv6_init_sock(struct sock *sk) { - udp_lib_init_sock(sk); + int res = udp_lib_init_sock(sk); + sk->sk_destruct = udpv6_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); - return 0; + return res; } INDIRECT_CALLABLE_SCOPE -- cgit v1.2.3 From 092263a03105539b8dfe74c59be4c6cce1304d5f Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 20 Sep 2025 23:34:07 +0200 Subject: net: phy: stop exporting phy_driver_register phy_driver_register() isn't used outside phy_device.c any longer, so we can stop exporting it. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/dff44b83-4a85-4fff-bf6b-f12efd97b56e@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 4 ++-- include/linux/phy.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index c82c1997147b..01269b865f5e 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3544,7 +3544,8 @@ static int phy_remove(struct device *dev) * @new_driver: new phy_driver to register * @owner: module owning this PHY */ -int phy_driver_register(struct phy_driver *new_driver, struct module *owner) +static int phy_driver_register(struct phy_driver *new_driver, + struct module *owner) { int retval; @@ -3587,7 +3588,6 @@ int phy_driver_register(struct phy_driver *new_driver, struct module *owner) return 0; } -EXPORT_SYMBOL(phy_driver_register); int phy_drivers_register(struct phy_driver *new_driver, int n, struct module *owner) diff --git a/include/linux/phy.h b/include/linux/phy.h index d09fc42e61f3..b377dfaa6801 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2032,7 +2032,6 @@ static inline int phy_read_status(struct phy_device *phydev) void phy_driver_unregister(struct phy_driver *drv); void phy_drivers_unregister(struct phy_driver *drv, int n); -int phy_driver_register(struct phy_driver *new_driver, struct module *owner); int phy_drivers_register(struct phy_driver *new_driver, int n, struct module *owner); void phy_error(struct phy_device *phydev); -- cgit v1.2.3 From 6043819e707cefb1c9e59d6e431dcfa735c4f975 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Mon, 22 Sep 2025 10:11:32 +0300 Subject: net/mlx5: fs, fix UAF in flow counter release Fix a kernel trace [1] caused by releasing an HWS action of a local flow counter in 
mlx5_cmd_hws_delete_fte(), where the HWS action refcount and mutex were not initialized and the counter struct could already be freed when deleting the rule. Fix it by adding the missing initializations and adding refcount for the local flow counter struct. [1] Kernel log: Call Trace: dump_stack_lvl+0x34/0x48 mlx5_fs_put_hws_action.part.0.cold+0x21/0x94 [mlx5_core] mlx5_fc_put_hws_action+0x96/0xad [mlx5_core] mlx5_fs_destroy_fs_actions+0x8b/0x152 [mlx5_core] mlx5_cmd_hws_delete_fte+0x5a/0xa0 [mlx5_core] del_hw_fte+0x1ce/0x260 [mlx5_core] mlx5_del_flow_rules+0x12d/0x240 [mlx5_core] ? ttwu_queue_wakelist+0xf4/0x110 mlx5_ib_destroy_flow+0x103/0x1b0 [mlx5_ib] uverbs_free_flow+0x20/0x50 [ib_uverbs] destroy_hw_idr_uobject+0x1b/0x50 [ib_uverbs] uverbs_destroy_uobject+0x34/0x1a0 [ib_uverbs] uobj_destroy+0x3c/0x80 [ib_uverbs] ib_uverbs_run_method+0x23e/0x360 [ib_uverbs] ? uverbs_finalize_object+0x60/0x60 [ib_uverbs] ib_uverbs_cmd_verbs+0x14f/0x2c0 [ib_uverbs] ? do_tty_write+0x1a9/0x270 ? file_tty_write.constprop.0+0x98/0xc0 ? new_sync_write+0xfc/0x190 ib_uverbs_ioctl+0xd7/0x160 [ib_uverbs] __x64_sys_ioctl+0x87/0xc0 do_syscall_64+0x59/0x90 Fixes: b581f4266928 ("net/mlx5: fs, manage flow counters HWS action sharing by refcount") Signed-off-by: Moshe Shemesh Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1758525094-816583-2-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 1 + .../net/ethernet/mellanox/mlx5/core/fs_counters.c | 25 +++++++++++++++++++--- .../mellanox/mlx5/core/steering/hws/fs_hws_pools.c | 8 ++++++- include/linux/mlx5/fs.h | 2 ++ 5 files changed, 33 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index db552c012b4f..80245c38dbad 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -663,7 +663,7 @@ static void del_sw_hw_rule(struct fs_node *node) BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION) | BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS); fte->act_dests.action.action &= ~MLX5_FLOW_CONTEXT_ACTION_COUNT; - mlx5_fc_local_destroy(rule->dest_attr.counter); + mlx5_fc_local_put(rule->dest_attr.counter); goto out; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 500826229b0b..e6a95b310b55 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -343,6 +343,7 @@ struct mlx5_fc { enum mlx5_fc_type type; struct mlx5_fc_bulk *bulk; struct mlx5_fc_cache cache; + refcount_t fc_local_refcount; /* last{packets,bytes} are used for calculating deltas since last reading. 
*/ u64 lastpackets; u64 lastbytes; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index 492775d3d193..83001eda3884 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -562,17 +562,36 @@ mlx5_fc_local_create(u32 counter_id, u32 offset, u32 bulk_size) counter->id = counter_id; fc_bulk->base_id = counter_id - offset; fc_bulk->fs_bulk.bulk_len = bulk_size; + refcount_set(&fc_bulk->hws_data.hws_action_refcount, 0); + mutex_init(&fc_bulk->hws_data.lock); counter->bulk = fc_bulk; + refcount_set(&counter->fc_local_refcount, 1); return counter; } EXPORT_SYMBOL(mlx5_fc_local_create); void mlx5_fc_local_destroy(struct mlx5_fc *counter) { - if (!counter || counter->type != MLX5_FC_TYPE_LOCAL) - return; - kfree(counter->bulk); kfree(counter); } EXPORT_SYMBOL(mlx5_fc_local_destroy); + +void mlx5_fc_local_get(struct mlx5_fc *counter) +{ + if (!counter || counter->type != MLX5_FC_TYPE_LOCAL) + return; + + refcount_inc(&counter->fc_local_refcount); +} + +void mlx5_fc_local_put(struct mlx5_fc *counter) +{ + if (!counter || counter->type != MLX5_FC_TYPE_LOCAL) + return; + + if (!refcount_dec_and_test(&counter->fc_local_refcount)) + return; + + mlx5_fc_local_destroy(counter); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c index f1ecdba74e1f..839d71bd4216 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws_pools.c @@ -407,15 +407,21 @@ struct mlx5hws_action *mlx5_fc_get_hws_action(struct mlx5hws_context *ctx, { struct mlx5_fs_hws_create_action_ctx create_ctx; struct mlx5_fc_bulk *fc_bulk = counter->bulk; + struct mlx5hws_action *hws_action; create_ctx.hws_ctx = ctx; create_ctx.id = fc_bulk->base_id; create_ctx.actions_type = MLX5HWS_ACTION_TYP_CTR; - return mlx5_fs_get_hws_action(&fc_bulk->hws_data, &create_ctx); + mlx5_fc_local_get(counter); + hws_action = mlx5_fs_get_hws_action(&fc_bulk->hws_data, &create_ctx); + if (!hws_action) + mlx5_fc_local_put(counter); + return hws_action; } void mlx5_fc_put_hws_action(struct mlx5_fc *counter) { mlx5_fs_put_hws_action(&counter->bulk->hws_data); + mlx5_fc_local_put(counter); } diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 86055d55836d..6ac76a0c3827 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -308,6 +308,8 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); struct mlx5_fc *mlx5_fc_local_create(u32 counter_id, u32 offset, u32 bulk_size); void mlx5_fc_local_destroy(struct mlx5_fc *counter); +void mlx5_fc_local_get(struct mlx5_fc *counter); +void mlx5_fc_local_put(struct mlx5_fc *counter); u64 mlx5_fc_query_lastuse(struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse); -- cgit v1.2.3 From 7384893d970ea114952aef54ad7e3d7d2ca82d4f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 16 Sep 2025 23:52:56 +0200 Subject: bpf: Allow uprobe program to change context registers Currently uprobe (BPF_PROG_TYPE_KPROBE) program can't write to the context registers data. While this makes sense for kprobe attachments, for uprobe attachment it might make sense to be able to change user space registers to alter application execution. 
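For example, a uprobe program can now rewrite a register before the traced function executes. A hedged sketch (x86-64 register naming; the binary path and function name are placeholders):

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    char LICENSE[] SEC("license") = "GPL";

    SEC("uprobe//usr/bin/app:compute")
    int rewrite_first_arg(struct pt_regs *ctx)
    {
    	ctx->di = 42;	/* overwrite the first integer argument (%rdi) */
    	return 0;
    }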
Since uprobe and kprobe programs share the same type (BPF_PROG_TYPE_KPROBE), we can't deny write access to context during the program load. We need to check on it during program attachment to see if it's going to be kprobe or uprobe. Storing the program's write attempt to context and checking on it during the attachment. Acked-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20250916215301.664963-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/events/core.c | 4 ++++ kernel/trace/bpf_trace.c | 9 +++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ba3a3be7eb2a..ea2ed6771cc6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1639,6 +1639,7 @@ struct bpf_prog_aux { bool priv_stack_requested; bool changes_pkt_data; bool might_sleep; + bool kprobe_write_ctx; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; diff --git a/kernel/events/core.c b/kernel/events/core.c index 820127536e62..1d354778dcd4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11232,6 +11232,10 @@ static int __perf_event_set_bpf_prog(struct perf_event *event, if (prog->kprobe_override && !is_kprobe) return -EINVAL; + /* Writing to context allowed only for uprobes. */ + if (prog->aux->kprobe_write_ctx && !is_uprobe) + return -EINVAL; + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f2360579658e..8f23f5273bab 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1338,8 +1338,6 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type { if (off < 0 || off >= sizeof(struct pt_regs)) return false; - if (type != BPF_READ) - return false; if (off % size != 0) return false; /* @@ -1349,6 +1347,9 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type if (off + size > sizeof(struct pt_regs)) return false; + if (type == BPF_WRITE) + prog->aux->kprobe_write_ctx = true; + return true; } @@ -2735,6 +2736,10 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr if (!is_kprobe_multi(prog)) return -EINVAL; + /* Writing to context is not allowed for kprobes. */ + if (prog->aux->kprobe_write_ctx) + return -EINVAL; + flags = attr->link_create.kprobe_multi.flags; if (flags & ~BPF_F_KPROBE_MULTI_RETURN) return -EINVAL; -- cgit v1.2.3 From d4680a11e14c7baf683cb8453d91d71d2e0b9d3e Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 24 Sep 2025 10:14:26 +0200 Subject: bpf: Mark kfuncs as __noclone Some distributions (e.g., CachyOS) support building the kernel with -O3, but doing so may break kfuncs, resulting in their symbols not being properly exported. In fact, with gcc -O3, some kfuncs may be optimized away despite being annotated as noinline. This happens because gcc can still clone the function during IPA optimizations, e.g., by duplicating or inlining it into callers, and then dropping the standalone symbol. This breaks BTF ID resolution since resolve_btfids relies on the presence of a global symbol for each kfunc. 
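Concretely, a kfunc definition looks roughly like this (the function itself is illustrative; the macros are the real ones from include/linux/btf.h):

    __bpf_kfunc_start_defs();

    /*
     * Must survive as a distinct global symbol so that resolve_btfids
     * can find it, whatever the optimization level.
     */
    __bpf_kfunc int bpf_example_add(int a, int b)
    {
    	return a + b;
    }

    __bpf_kfunc_end_defs();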
Currently, this is not an issue for upstream, because we don't allow building the kernel with -O3, but it may be safer to address it anyway, to prevent potential issues in the future if compilers become more aggressive with optimizations. Therefore, add __noclone to __bpf_kfunc to ensure kfuncs are never cloned and remain distinct, globally visible symbols, regardless of the optimization level. Fixes: 57e7c169cd6af ("bpf: Add __bpf_kfunc tag for marking kernel functions as kfuncs") Acked-by: David Vernet Acked-by: Yonghong Song Signed-off-by: Andrea Righi Link: https://lore.kernel.org/r/20250924081426.156934-1-arighi@nvidia.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 9eda6b113f9b..f06976ffb63f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -86,7 +86,7 @@ * as to avoid issues such as the compiler inlining or eliding either a static * kfunc, or a global kfunc in an LTO build. */ -#define __bpf_kfunc __used __retain noinline +#define __bpf_kfunc __used __retain __noclone noinline #define __bpf_kfunc_start_defs() \ __diag_push(); \ -- cgit v1.2.3 From 64f89f6e1f2b7f8f203d218a8c8d90922e1d4048 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 17 Sep 2025 10:54:05 +0200 Subject: gpio: generic: rename BGPIOF_ flags to GPIO_GENERIC_ Make the flags passed to gpio_generic_chip_init() use the same prefix as the rest of the modernized generic GPIO chip API. Link: https://lore.kernel.org/r/20250917-gpio-generic-flags-v1-1-69f51fee8c89@linaro.org Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-amdpt.c | 2 +- drivers/gpio/gpio-brcmstb.c | 2 +- drivers/gpio/gpio-cadence.c | 2 +- drivers/gpio/gpio-ge.c | 2 +- drivers/gpio/gpio-grgpio.c | 2 +- drivers/gpio/gpio-hisi.c | 3 ++- drivers/gpio/gpio-hlwd.c | 2 +- drivers/gpio/gpio-ixp4xx.c | 2 +- drivers/gpio/gpio-mmio.c | 28 ++++++++++++++-------------- drivers/gpio/gpio-mpc8xxx.c | 4 ++-- drivers/gpio/gpio-mt7621.c | 2 +- drivers/gpio/gpio-mxc.c | 2 +- drivers/gpio/gpio-rda.c | 2 +- drivers/gpio/gpio-realtek-otto.c | 2 +- drivers/gpio/gpio-sifive.c | 2 +- drivers/gpio/gpio-spacemit-k1.c | 3 ++- drivers/gpio/gpio-vf610.c | 4 ++-- drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c | 2 +- drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c | 2 +- drivers/pinctrl/nuvoton/pinctrl-wpcm450.c | 2 +- drivers/pinctrl/stm32/pinctrl-stm32-hdp.c | 2 +- include/linux/gpio/driver.h | 18 +++++++++--------- 22 files changed, 47 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-amdpt.c b/drivers/gpio/gpio-amdpt.c index bbaf42307bc3..8458a6949c65 100644 --- a/drivers/gpio/gpio-amdpt.c +++ b/drivers/gpio/gpio-amdpt.c @@ -94,7 +94,7 @@ static int pt_gpio_probe(struct platform_device *pdev) .dat = pt_gpio->reg_base + PT_INPUTDATA_REG, .set = pt_gpio->reg_base + PT_OUTPUTDATA_REG, .dirout = pt_gpio->reg_base + PT_DIRECTION_REG, - .flags = BGPIOF_READ_OUTPUT_REG_SET, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, }; ret = gpio_generic_chip_init(&pt_gpio->chip, &config); diff --git a/drivers/gpio/gpio-brcmstb.c b/drivers/gpio/gpio-brcmstb.c index be3ff916e134..f40c9472588b 100644 --- a/drivers/gpio/gpio-brcmstb.c +++ b/drivers/gpio/gpio-brcmstb.c @@ -630,7 +630,7 @@ static int brcmstb_gpio_probe(struct platform_device *pdev) * else leave I/O in little endian mode. 
*/ #if defined(CONFIG_MIPS) && defined(__BIG_ENDIAN) - flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER; + flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; #endif of_property_for_each_u32(np, "brcm,gpio-bank-widths", bank_width) { diff --git a/drivers/gpio/gpio-cadence.c b/drivers/gpio/gpio-cadence.c index c647953521c7..b75734ca22dd 100644 --- a/drivers/gpio/gpio-cadence.c +++ b/drivers/gpio/gpio-cadence.c @@ -181,7 +181,7 @@ static int cdns_gpio_probe(struct platform_device *pdev) config.dat = cgpio->regs + CDNS_GPIO_INPUT_VALUE; config.set = cgpio->regs + CDNS_GPIO_OUTPUT_VALUE; config.dirin = cgpio->regs + CDNS_GPIO_DIRECTION_MODE; - config.flags = BGPIOF_READ_OUTPUT_REG_SET; + config.flags = GPIO_GENERIC_READ_OUTPUT_REG_SET; ret = gpio_generic_chip_init(&cgpio->gen_gc, &config); if (ret) { diff --git a/drivers/gpio/gpio-ge.c b/drivers/gpio/gpio-ge.c index b5cbf27b8f44..66bdff36eb61 100644 --- a/drivers/gpio/gpio-ge.c +++ b/drivers/gpio/gpio-ge.c @@ -73,7 +73,7 @@ static int __init gef_gpio_probe(struct platform_device *pdev) .dat = regs + GEF_GPIO_IN, .set = regs + GEF_GPIO_OUT, .dirin = regs + GEF_GPIO_DIRECT, - .flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER, + .flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER, }; ret = gpio_generic_chip_init(chip, &config); diff --git a/drivers/gpio/gpio-grgpio.c b/drivers/gpio/gpio-grgpio.c index 5930f4c6f2b5..0c0f97fa14fc 100644 --- a/drivers/gpio/gpio-grgpio.c +++ b/drivers/gpio/gpio-grgpio.c @@ -359,7 +359,7 @@ static int grgpio_probe(struct platform_device *ofdev) .dat = regs + GRGPIO_DATA, .set = regs + GRGPIO_OUTPUT, .dirout = regs + GRGPIO_DIR, - .flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER, + .flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER, }; gc = &priv->chip.gc; diff --git a/drivers/gpio/gpio-hisi.c b/drivers/gpio/gpio-hisi.c index d8c4ab02ceae..d26298c8351b 100644 --- a/drivers/gpio/gpio-hisi.c +++ b/drivers/gpio/gpio-hisi.c @@ -300,7 +300,8 @@ static int hisi_gpio_probe(struct platform_device *pdev) .clr = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DR_CLR_WX, .dirout = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_SET_WX, .dirin = hisi_gpio->reg_base + HISI_GPIO_SWPORT_DDR_CLR_WX, - .flags = BGPIOF_NO_SET_ON_INPUT | BGPIOF_UNREADABLE_REG_DIR, + .flags = GPIO_GENERIC_NO_SET_ON_INPUT | + GPIO_GENERIC_UNREADABLE_REG_DIR, }; ret = gpio_generic_chip_init(&hisi_gpio->chip, &config); diff --git a/drivers/gpio/gpio-hlwd.c b/drivers/gpio/gpio-hlwd.c index a395f87436ac..043ce5ef3b07 100644 --- a/drivers/gpio/gpio-hlwd.c +++ b/drivers/gpio/gpio-hlwd.c @@ -253,7 +253,7 @@ static int hlwd_gpio_probe(struct platform_device *pdev) .dat = hlwd->regs + HW_GPIOB_IN, .set = hlwd->regs + HW_GPIOB_OUT, .dirout = hlwd->regs + HW_GPIOB_DIR, - .flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER, + .flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER, }; res = gpio_generic_chip_init(&hlwd->gpioc, &config); diff --git a/drivers/gpio/gpio-ixp4xx.c b/drivers/gpio/gpio-ixp4xx.c index 8a3b6b192288..f34d87869c8b 100644 --- a/drivers/gpio/gpio-ixp4xx.c +++ b/drivers/gpio/gpio-ixp4xx.c @@ -289,7 +289,7 @@ static int ixp4xx_gpio_probe(struct platform_device *pdev) * for big endian. 
*/ #if defined(CONFIG_CPU_BIG_ENDIAN) - flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER; + flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; #else flags = 0; #endif diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c index a3df14d672a9..7d6dd36cf1ae 100644 --- a/drivers/gpio/gpio-mmio.c +++ b/drivers/gpio/gpio-mmio.c @@ -554,7 +554,7 @@ static int bgpio_setup_io(struct gpio_generic_chip *chip, chip->reg_set = cfg->set; gc->set = bgpio_set_set; gc->set_multiple = bgpio_set_multiple_set; - } else if (cfg->flags & BGPIOF_NO_OUTPUT) { + } else if (cfg->flags & GPIO_GENERIC_NO_OUTPUT) { gc->set = bgpio_set_none; gc->set_multiple = NULL; } else { @@ -562,8 +562,8 @@ static int bgpio_setup_io(struct gpio_generic_chip *chip, gc->set_multiple = bgpio_set_multiple; } - if (!(cfg->flags & BGPIOF_UNREADABLE_REG_SET) && - (cfg->flags & BGPIOF_READ_OUTPUT_REG_SET)) { + if (!(cfg->flags & GPIO_GENERIC_UNREADABLE_REG_SET) && + (cfg->flags & GPIO_GENERIC_READ_OUTPUT_REG_SET)) { gc->get = bgpio_get_set; if (!chip->be_bits) gc->get_multiple = bgpio_get_set_multiple; @@ -593,19 +593,19 @@ static int bgpio_setup_direction(struct gpio_generic_chip *chip, if (cfg->dirout || cfg->dirin) { chip->reg_dir_out = cfg->dirout; chip->reg_dir_in = cfg->dirin; - if (cfg->flags & BGPIOF_NO_SET_ON_INPUT) + if (cfg->flags & GPIO_GENERIC_NO_SET_ON_INPUT) gc->direction_output = bgpio_dir_out_dir_first; else gc->direction_output = bgpio_dir_out_val_first; gc->direction_input = bgpio_dir_in; gc->get_direction = bgpio_get_dir; } else { - if (cfg->flags & BGPIOF_NO_OUTPUT) + if (cfg->flags & GPIO_GENERIC_NO_OUTPUT) gc->direction_output = bgpio_dir_out_err; else gc->direction_output = bgpio_simple_dir_out; - if (cfg->flags & BGPIOF_NO_INPUT) + if (cfg->flags & GPIO_GENERIC_NO_INPUT) gc->direction_input = bgpio_dir_in_err; else gc->direction_input = bgpio_simple_dir_in; @@ -654,7 +654,7 @@ int gpio_generic_chip_init(struct gpio_generic_chip *chip, gc->label = dev_name(dev); gc->base = -1; gc->request = bgpio_request; - chip->be_bits = !!(flags & BGPIOF_BIG_ENDIAN); + chip->be_bits = !!(flags & GPIO_GENERIC_BIG_ENDIAN); ret = gpiochip_get_ngpios(gc, dev); if (ret) @@ -665,7 +665,7 @@ int gpio_generic_chip_init(struct gpio_generic_chip *chip, return ret; ret = bgpio_setup_accessors(dev, chip, - flags & BGPIOF_BIG_ENDIAN_BYTE_ORDER); + flags & GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER); if (ret) return ret; @@ -673,7 +673,7 @@ int gpio_generic_chip_init(struct gpio_generic_chip *chip, if (ret) return ret; - if (flags & BGPIOF_PINCTRL_BACKEND) { + if (flags & GPIO_GENERIC_PINCTRL_BACKEND) { chip->pinctrl = true; /* Currently this callback is only used for pincontrol */ gc->free = gpiochip_generic_free; @@ -681,17 +681,17 @@ int gpio_generic_chip_init(struct gpio_generic_chip *chip, chip->sdata = chip->read_reg(chip->reg_dat); if (gc->set == bgpio_set_set && - !(flags & BGPIOF_UNREADABLE_REG_SET)) + !(flags & GPIO_GENERIC_UNREADABLE_REG_SET)) chip->sdata = chip->read_reg(chip->reg_set); - if (flags & BGPIOF_UNREADABLE_REG_DIR) + if (flags & GPIO_GENERIC_UNREADABLE_REG_DIR) chip->dir_unreadable = true; /* * Inspect hardware to find initial direction setting. 
*/ if ((chip->reg_dir_out || chip->reg_dir_in) && - !(flags & BGPIOF_UNREADABLE_REG_DIR)) { + !(flags & GPIO_GENERIC_UNREADABLE_REG_DIR)) { if (chip->reg_dir_out) chip->sdir = chip->read_reg(chip->reg_dir_out); else if (chip->reg_dir_in) @@ -787,10 +787,10 @@ static int bgpio_pdev_probe(struct platform_device *pdev) return -ENOMEM; if (device_is_big_endian(dev)) - flags |= BGPIOF_BIG_ENDIAN_BYTE_ORDER; + flags |= GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; if (device_property_read_bool(dev, "no-output")) - flags |= BGPIOF_NO_OUTPUT; + flags |= GPIO_GENERIC_NO_OUTPUT; config = (struct gpio_generic_chip_config) { .dev = dev, diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c index a2a83afb41bb..bfe828734ee1 100644 --- a/drivers/gpio/gpio-mpc8xxx.c +++ b/drivers/gpio/gpio-mpc8xxx.c @@ -350,13 +350,13 @@ static int mpc8xxx_probe(struct platform_device *pdev) .sz = 4, .dat = mpc8xxx_gc->regs + GPIO_DAT, .dirout = mpc8xxx_gc->regs + GPIO_DIR, - .flags = BGPIOF_BIG_ENDIAN + .flags = GPIO_GENERIC_BIG_ENDIAN }; if (device_property_read_bool(dev, "little-endian")) { dev_dbg(dev, "GPIO registers are LITTLE endian\n"); } else { - config.flags |= BGPIOF_BIG_ENDIAN_BYTE_ORDER; + config.flags |= GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; dev_dbg(dev, "GPIO registers are BIG endian\n"); } diff --git a/drivers/gpio/gpio-mt7621.c b/drivers/gpio/gpio-mt7621.c index e7bb9b2cd6cf..91230be51587 100644 --- a/drivers/gpio/gpio-mt7621.c +++ b/drivers/gpio/gpio-mt7621.c @@ -242,7 +242,7 @@ mediatek_gpio_bank_probe(struct device *dev, int bank) .set = set, .clr = ctrl, .dirout = diro, - .flags = BGPIOF_NO_SET_ON_INPUT, + .flags = GPIO_GENERIC_NO_SET_ON_INPUT, }; ret = gpio_generic_chip_init(&rg->chip, &config); diff --git a/drivers/gpio/gpio-mxc.c b/drivers/gpio/gpio-mxc.c index 433cbadc3a4c..52060b3ec745 100644 --- a/drivers/gpio/gpio-mxc.c +++ b/drivers/gpio/gpio-mxc.c @@ -481,7 +481,7 @@ static int mxc_gpio_probe(struct platform_device *pdev) config.dat = port->base + GPIO_PSR; config.set = port->base + GPIO_DR; config.dirout = port->base + GPIO_GDIR; - config.flags = BGPIOF_READ_OUTPUT_REG_SET; + config.flags = GPIO_GENERIC_READ_OUTPUT_REG_SET; err = gpio_generic_chip_init(&port->gen_gc, &config); if (err) diff --git a/drivers/gpio/gpio-rda.c b/drivers/gpio/gpio-rda.c index fb479d13eb01..7bbc6f0ce4c8 100644 --- a/drivers/gpio/gpio-rda.c +++ b/drivers/gpio/gpio-rda.c @@ -245,7 +245,7 @@ static int rda_gpio_probe(struct platform_device *pdev) .clr = rda_gpio->base + RDA_GPIO_CLR, .dirout = rda_gpio->base + RDA_GPIO_OEN_SET_OUT, .dirin = rda_gpio->base + RDA_GPIO_OEN_SET_IN, - .flags = BGPIOF_READ_OUTPUT_REG_SET, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, }; ret = gpio_generic_chip_init(&rda_gpio->chip, &config); diff --git a/drivers/gpio/gpio-realtek-otto.c b/drivers/gpio/gpio-realtek-otto.c index 37b4f73771e6..de527f4fc6c2 100644 --- a/drivers/gpio/gpio-realtek-otto.c +++ b/drivers/gpio/gpio-realtek-otto.c @@ -395,7 +395,7 @@ static int realtek_gpio_probe(struct platform_device *pdev) ctrl->bank_write = realtek_gpio_bank_write; ctrl->line_imr_pos = realtek_gpio_line_imr_pos; } else { - gen_gc_flags = BGPIOF_BIG_ENDIAN_BYTE_ORDER; + gen_gc_flags = GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER; ctrl->bank_read = realtek_gpio_bank_read_swapped; ctrl->bank_write = realtek_gpio_bank_write_swapped; ctrl->line_imr_pos = realtek_gpio_line_imr_pos_swapped; diff --git a/drivers/gpio/gpio-sifive.c b/drivers/gpio/gpio-sifive.c index 2ced87ffd3bb..94ef2efbd14f 100644 --- a/drivers/gpio/gpio-sifive.c +++ 
b/drivers/gpio/gpio-sifive.c @@ -223,7 +223,7 @@ static int sifive_gpio_probe(struct platform_device *pdev) .set = chip->base + SIFIVE_GPIO_OUTPUT_VAL, .dirout = chip->base + SIFIVE_GPIO_OUTPUT_EN, .dirin = chip->base + SIFIVE_GPIO_INPUT_EN, - .flags = BGPIOF_READ_OUTPUT_REG_SET, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, }; ret = gpio_generic_chip_init(&chip->gen_gc, &config); diff --git a/drivers/gpio/gpio-spacemit-k1.c b/drivers/gpio/gpio-spacemit-k1.c index a0af23f73281..eb66a15c002f 100644 --- a/drivers/gpio/gpio-spacemit-k1.c +++ b/drivers/gpio/gpio-spacemit-k1.c @@ -197,7 +197,8 @@ static int spacemit_gpio_add_bank(struct spacemit_gpio *sg, .clr = clr, .dirout = dirout, .dirin = dirin, - .flags = BGPIOF_UNREADABLE_REG_SET | BGPIOF_UNREADABLE_REG_DIR, + .flags = GPIO_GENERIC_UNREADABLE_REG_SET | + GPIO_GENERIC_UNREADABLE_REG_DIR, }; /* This registers 32 GPIO lines per bank */ diff --git a/drivers/gpio/gpio-vf610.c b/drivers/gpio/gpio-vf610.c index f3590db72b14..aa8586d8a787 100644 --- a/drivers/gpio/gpio-vf610.c +++ b/drivers/gpio/gpio-vf610.c @@ -296,14 +296,14 @@ static int vf610_gpio_probe(struct platform_device *pdev) } gc = &port->chip.gc; - flags = BGPIOF_PINCTRL_BACKEND; + flags = GPIO_GENERIC_PINCTRL_BACKEND; /* * We only read the output register for current value on output * lines if the direction register is available so we can switch * direction. */ if (port->sdata->have_paddr) - flags |= BGPIOF_READ_OUTPUT_REG_SET; + flags |= GPIO_GENERIC_READ_OUTPUT_REG_SET; config = (struct gpio_generic_chip_config) { .dev = dev, diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c index c2ca71ebb973..10765f19b48b 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm7xx.c @@ -1842,7 +1842,7 @@ static int npcm7xx_gpio_of(struct npcm7xx_pinctrl *pctrl) .dat = pctrl->gpio_bank[id].base + NPCM7XX_GP_N_DIN, .set = pctrl->gpio_bank[id].base + NPCM7XX_GP_N_DOUT, .dirin = pctrl->gpio_bank[id].base + NPCM7XX_GP_N_IEM, - .flags = BGPIOF_READ_OUTPUT_REG_SET, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, }; ret = gpio_generic_chip_init(&pctrl->gpio_bank[id].chip, &config); diff --git a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c index 0f155a685bba..1005b464a469 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c +++ b/drivers/pinctrl/nuvoton/pinctrl-npcm8xx.c @@ -2335,7 +2335,7 @@ static int npcm8xx_gpio_fw(struct npcm8xx_pinctrl *pctrl) .dat = pctrl->gpio_bank[id].base + NPCM8XX_GP_N_DIN, .set = pctrl->gpio_bank[id].base + NPCM8XX_GP_N_DOUT, .dirin = pctrl->gpio_bank[id].base + NPCM8XX_GP_N_IEM, - .flags = BGPIOF_READ_OUTPUT_REG_SET, + .flags = GPIO_GENERIC_READ_OUTPUT_REG_SET, }; ret = gpio_generic_chip_init(&pctrl->gpio_bank[id].chip, &config); diff --git a/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c b/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c index 4dd8a3daa83e..c575949e42e6 100644 --- a/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c +++ b/drivers/pinctrl/nuvoton/pinctrl-wpcm450.c @@ -1061,7 +1061,7 @@ static int wpcm450_gpio_register(struct platform_device *pdev, set = pctrl->gpio_base + bank->dataout; dirout = pctrl->gpio_base + bank->cfg0; } else { - flags = BGPIOF_NO_OUTPUT; + flags = GPIO_GENERIC_NO_OUTPUT; } config = (typeof(config)){ diff --git a/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c b/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c index dea49b9aabf2..971959a75b0c 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c +++ 
b/drivers/pinctrl/stm32/pinctrl-stm32-hdp.c @@ -648,7 +648,7 @@ static int stm32_hdp_probe(struct platform_device *pdev) .dat = hdp->base + HDP_GPOVAL, .set = hdp->base + HDP_GPOSET, .clr = hdp->base + HDP_GPOCLR, - .flags = BGPIOF_NO_INPUT, + .flags = GPIO_GENERIC_NO_INPUT, }; err = gpio_generic_chip_init(&hdp->gpio_chip, &config); diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 9b14fd20f13e..e62622e42cad 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -684,15 +684,15 @@ int gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc, #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ -#define BGPIOF_BIG_ENDIAN BIT(0) -#define BGPIOF_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ -#define BGPIOF_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ -#define BGPIOF_BIG_ENDIAN_BYTE_ORDER BIT(3) -#define BGPIOF_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ -#define BGPIOF_NO_OUTPUT BIT(5) /* only input */ -#define BGPIOF_NO_SET_ON_INPUT BIT(6) -#define BGPIOF_PINCTRL_BACKEND BIT(7) /* Call pinctrl direction setters */ -#define BGPIOF_NO_INPUT BIT(8) /* only output */ +#define GPIO_GENERIC_BIG_ENDIAN BIT(0) +#define GPIO_GENERIC_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ +#define GPIO_GENERIC_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ +#define GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER BIT(3) +#define GPIO_GENERIC_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ +#define GPIO_GENERIC_NO_OUTPUT BIT(5) /* only input */ +#define GPIO_GENERIC_NO_SET_ON_INPUT BIT(6) +#define GPIO_GENERIC_PINCTRL_BACKEND BIT(7) /* Call pinctrl direction setters */ +#define GPIO_GENERIC_NO_INPUT BIT(8) /* only output */ #ifdef CONFIG_GPIOLIB_IRQCHIP int gpiochip_irqchip_add_domain(struct gpio_chip *gc, -- cgit v1.2.3 From 2235b26c1b25daf253748acff501af3ea85faaa8 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 17 Sep 2025 10:54:06 +0200 Subject: gpio: generic: move GPIO_GENERIC_ flags to the correct header These flags are specific to gpio-mmio and belong in linux/gpio/generic.h so move them there. 
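For illustration, a minimal probe sketch for a hypothetical memory-mapped chip using the relocated flags; the "foo" names and register offsets are invented for this sketch, and after this move linux/gpio/generic.h alone provides the flag definitions:

	#include <linux/gpio/generic.h>
	#include <linux/platform_device.h>

	/* Hypothetical driver state: a generic chip plus its MMIO base. */
	struct foo_gpio {
		struct gpio_generic_chip chip;
		void __iomem *base;
	};

	static int foo_gpio_probe(struct platform_device *pdev)
	{
		struct gpio_generic_chip_config config;
		struct foo_gpio *fg;

		fg = devm_kzalloc(&pdev->dev, sizeof(*fg), GFP_KERNEL);
		if (!fg)
			return -ENOMEM;

		fg->base = devm_platform_ioremap_resource(pdev, 0);
		if (IS_ERR(fg->base))
			return PTR_ERR(fg->base);

		config = (struct gpio_generic_chip_config) {
			.dev = &pdev->dev,
			.sz = 4,
			.dat = fg->base + 0x00,	/* hypothetical register layout */
			.set = fg->base + 0x04,
			.dirout = fg->base + 0x08,
			.flags = GPIO_GENERIC_READ_OUTPUT_REG_SET,
		};

		return gpio_generic_chip_init(&fg->chip, &config);
	}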
Link: https://lore.kernel.org/r/20250917-gpio-generic-flags-v1-2-69f51fee8c89@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 10 ---------- include/linux/gpio/generic.h | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index e62622e42cad..fabe2baf7b50 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -684,16 +684,6 @@ int gpiochip_populate_parent_fwspec_fourcell(struct gpio_chip *gc, #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ -#define GPIO_GENERIC_BIG_ENDIAN BIT(0) -#define GPIO_GENERIC_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ -#define GPIO_GENERIC_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ -#define GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER BIT(3) -#define GPIO_GENERIC_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ -#define GPIO_GENERIC_NO_OUTPUT BIT(5) /* only input */ -#define GPIO_GENERIC_NO_SET_ON_INPUT BIT(6) -#define GPIO_GENERIC_PINCTRL_BACKEND BIT(7) /* Call pinctrl direction setters */ -#define GPIO_GENERIC_NO_INPUT BIT(8) /* only output */ - #ifdef CONFIG_GPIOLIB_IRQCHIP int gpiochip_irqchip_add_domain(struct gpio_chip *gc, struct irq_domain *domain); diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h index 162430d96660..ff566dc9c3cb 100644 --- a/include/linux/gpio/generic.h +++ b/include/linux/gpio/generic.h @@ -9,6 +9,16 @@ struct device; +#define GPIO_GENERIC_BIG_ENDIAN BIT(0) +#define GPIO_GENERIC_UNREADABLE_REG_SET BIT(1) /* reg_set is unreadable */ +#define GPIO_GENERIC_UNREADABLE_REG_DIR BIT(2) /* reg_dir is unreadable */ +#define GPIO_GENERIC_BIG_ENDIAN_BYTE_ORDER BIT(3) +#define GPIO_GENERIC_READ_OUTPUT_REG_SET BIT(4) /* reg_set stores output value */ +#define GPIO_GENERIC_NO_OUTPUT BIT(5) /* only input */ +#define GPIO_GENERIC_NO_SET_ON_INPUT BIT(6) +#define GPIO_GENERIC_PINCTRL_BACKEND BIT(7) /* Call pinctrl direction setters */ +#define GPIO_GENERIC_NO_INPUT BIT(8) /* only output */ + /** * struct gpio_generic_chip_config - Generic GPIO chip configuration data * @dev: Parent device of the new GPIO chip (compulsory). -- cgit v1.2.3 From 23049938605bda390f875ce20e0704252c2e5c3d Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:37:10 +0900 Subject: can: populate the minimum and maximum MTU values By populating: net_device->min_mtu and net_device->max_mtu the net core infrastructure will automatically: 1. validate that the user's inputs are in range. 2. report those min and max MTU values through the netlink interface. Add can_set_default_mtu(), which sets the default MTU value as well as the minimum and maximum values. The logic for the default MTU value remains unchanged: - CANFD_MTU if the device has a static CAN_CTRLMODE_FD. - CAN_MTU otherwise. Call can_set_default_mtu() each time CAN_CTRLMODE_FD is modified. This will guarantee that the MTU value is always consistent with the control mode flags. With this, the checks done in can_change_mtu() become fully redundant and will be removed in an upcoming change. It is now possible to confirm the minimum and maximum MTU values on a physical CAN interface by doing: $ ip --details link show can0 The virtual interfaces (vcan and vxcan) are not impacted by this change.
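To illustrate the effect (a paraphrase of the core's validation, not the exact upstream code), the bounds check that previously lived in can_change_mtu() now happens generically before ndo_change_mtu() is invoked:

	/* Sketch: what the net core now enforces for CAN devices. */
	if (new_mtu < dev->min_mtu || new_mtu > dev->max_mtu)
		return -EINVAL;

With min_mtu == max_mtu == CAN_MTU (16 bytes) on a classical CAN device, e.g. "ip link set can0 mtu 72" is rejected by the core unless CAN_CTRLMODE_FD is enabled, in which case the only accepted value is CANFD_MTU (72 bytes).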
Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-can-fix-mtu-v3-3-581bde113f52@kernel.org [mkl: squashed https://patch.msgid.link/20250924143644.17622-2-mailhol@kernel.org] Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/dev.c | 20 ++++++++++++++++++-- drivers/net/can/dev/netlink.c | 9 ++++----- include/linux/can/dev.h | 1 + 3 files changed, 23 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 02bfed37cc93..befdeb4c54c2 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -240,6 +240,8 @@ void can_setup(struct net_device *dev) { dev->type = ARPHRD_CAN; dev->mtu = CAN_MTU; + dev->min_mtu = CAN_MTU; + dev->max_mtu = CAN_MTU; dev->hard_header_len = 0; dev->addr_len = 0; dev->tx_queue_len = 10; @@ -309,6 +311,21 @@ void free_candev(struct net_device *dev) } EXPORT_SYMBOL_GPL(free_candev); +void can_set_default_mtu(struct net_device *dev) +{ + struct can_priv *priv = netdev_priv(dev); + + if (priv->ctrlmode & CAN_CTRLMODE_FD) { + dev->mtu = CANFD_MTU; + dev->min_mtu = CANFD_MTU; + dev->max_mtu = CANFD_MTU; + } else { + dev->mtu = CAN_MTU; + dev->min_mtu = CAN_MTU; + dev->max_mtu = CAN_MTU; + } +} + /* changing MTU and control mode for CAN/CANFD devices */ int can_change_mtu(struct net_device *dev, int new_mtu) { @@ -361,8 +378,7 @@ int can_set_static_ctrlmode(struct net_device *dev, u32 static_mode) priv->ctrlmode = static_mode; /* override MTU which was set by default in can_setup()? */ - if (static_mode & CAN_CTRLMODE_FD) - dev->mtu = CANFD_MTU; + can_set_default_mtu(dev); return 0; } diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c index d9f6ab3efb97..248f607e3864 100644 --- a/drivers/net/can/dev/netlink.c +++ b/drivers/net/can/dev/netlink.c @@ -223,17 +223,16 @@ static int can_changelink(struct net_device *dev, struct nlattr *tb[], priv->ctrlmode &= ~cm->mask; priv->ctrlmode |= maskedflags; - /* CAN_CTRLMODE_FD can only be set when driver supports FD */ - if (priv->ctrlmode & CAN_CTRLMODE_FD) { - dev->mtu = CANFD_MTU; - } else { - dev->mtu = CAN_MTU; + /* Wipe potential leftovers from previous CAN FD config */ + if (!(priv->ctrlmode & CAN_CTRLMODE_FD)) { memset(&priv->fd.data_bittiming, 0, sizeof(priv->fd.data_bittiming)); priv->ctrlmode &= ~CAN_CTRLMODE_FD_TDC_MASK; memset(&priv->fd.tdc, 0, sizeof(priv->fd.tdc)); } + can_set_default_mtu(dev); + fd_tdc_flag_provided = cm->mask & CAN_CTRLMODE_FD_TDC_MASK; /* CAN_CTRLMODE_TDC_{AUTO,MANUAL} are mutually * exclusive: make sure to turn the other one off diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 5dc58360c2d7..3354f70ed2c6 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -166,6 +166,7 @@ struct can_priv *safe_candev_priv(struct net_device *dev); int open_candev(struct net_device *dev); void close_candev(struct net_device *dev); +void can_set_default_mtu(struct net_device *dev); int can_change_mtu(struct net_device *dev, int new_mtu); int __must_check can_set_static_ctrlmode(struct net_device *dev, u32 static_mode); -- cgit v1.2.3 From cc470fcf1d59f9d6186810ea5253da49a4f85f83 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:58:26 +0900 Subject: can: dev: move struct data_bittiming_params to linux/can/bittiming.h In commit b803c4a4f788 ("can: dev: add struct data_bittiming_params to group FD parameters"), struct data_bittiming_params was put into linux/can/dev.h. 
Since this structure is a collection of bittiming parameters, bittiming.h is, on second thought, a better location. This way, users of struct data_bittiming_params will no longer be forced to include linux/can/dev.h, which removes some complexity and reduces the risk of circular dependencies in headers. Move struct data_bittiming_params from linux/can/dev.h to linux/can/bittiming.h. Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-canxl-netlink-prep-v4-1-e720d28f66fe@kernel.org Signed-off-by: Marc Kleine-Budde --- include/linux/can/bittiming.h | 11 +++++++++++ include/linux/can/dev.h | 11 ----------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index 5dfdbb63b1d5..6572ec1712ca 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -114,6 +114,17 @@ struct can_tdc_const { u32 tdcf_max; }; +struct data_bittiming_params { + const struct can_bittiming_const *data_bittiming_const; + struct can_bittiming data_bittiming; + const struct can_tdc_const *tdc_const; + struct can_tdc tdc; + const u32 *data_bitrate_const; + unsigned int data_bitrate_const_cnt; + int (*do_set_data_bittiming)(struct net_device *dev); + int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv); +}; + #ifdef CONFIG_CAN_CALC_BITTIMING int can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, const struct can_bittiming_const *btc, struct netlink_ext_ack *extack); diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 3354f70ed2c6..c2fe956ab776 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -38,17 +38,6 @@ enum can_termination_gpio { CAN_TERMINATION_GPIO_MAX, }; -struct data_bittiming_params { - const struct can_bittiming_const *data_bittiming_const; - struct can_bittiming data_bittiming; - const struct can_tdc_const *tdc_const; - struct can_tdc tdc; - const u32 *data_bitrate_const; - unsigned int data_bitrate_const_cnt; - int (*do_set_data_bittiming)(struct net_device *dev); - int (*do_get_auto_tdcv)(const struct net_device *dev, u32 *tdcv); -}; - /* * CAN common private data */ -- cgit v1.2.3 From 7208385df7846d30e29febc6c6280cb32e91ee82 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:58:27 +0900 Subject: can: dev: make can_get_relative_tdco() FD agnostic and move it to bittiming.h can_get_relative_tdco() needs to access can_priv->fd, making it specific to CAN FD. Change the function parameter from struct can_priv to struct data_bittiming_params. This way, the function becomes CAN FD agnostic and can be reused later on for the CAN XL TDC. Now that we dropped the dependency on struct can_priv, also move can_get_relative_tdco() back to bittiming.h where it was meant to belong.
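A hedged sketch of a caller after this change, assuming an FD-capable driver where priv is the usual struct can_priv and priv->fd is its embedded struct data_bittiming_params:

	const struct can_priv *priv = netdev_priv(dev);
	s32 rel_tdco = can_get_relative_tdco(&priv->fd);

The pre-change equivalent was can_get_relative_tdco(priv).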
Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-canxl-netlink-prep-v4-2-e720d28f66fe@kernel.org Signed-off-by: Marc Kleine-Budde --- include/linux/can/bittiming.h | 29 +++++++++++++++++++++++++++++ include/linux/can/dev.h | 29 ----------------------------- 2 files changed, 29 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index 6572ec1712ca..4d5f7794194a 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -160,6 +160,35 @@ int can_get_bittiming(const struct net_device *dev, struct can_bittiming *bt, const unsigned int bitrate_const_cnt, struct netlink_ext_ack *extack); +/* + * can_get_relative_tdco() - TDCO relative to the sample point + * + * struct can_tdc::tdco represents the absolute offset from TDCV. Some + * controllers use instead an offset relative to the Sample Point (SP) + * such that: + * + * SSP = TDCV + absolute TDCO + * = TDCV + SP + relative TDCO + * + * -+----------- one bit ----------+-- TX pin + * |<--- Sample Point --->| + * + * --+----------- one bit ----------+-- RX pin + * |<-------- TDCV -------->| + * |<------------------------>| absolute TDCO + * |<--- Sample Point --->| + * | |<->| relative TDCO + * |<------------- Secondary Sample Point ------------>| + */ +static inline s32 can_get_relative_tdco(const struct data_bittiming_params *dbt_params) +{ + const struct can_bittiming *dbt = &dbt_params->data_bittiming; + s32 sample_point_in_tc = (CAN_SYNC_SEG + dbt->prop_seg + + dbt->phase_seg1) * dbt->brp; + + return (s32)dbt_params->tdc.tdco - sample_point_in_tc; +} + /* * can_bit_time() - Duration of one bit * diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index c2fe956ab776..8e75e9b3830a 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -85,35 +85,6 @@ static inline bool can_fd_tdc_is_enabled(const struct can_priv *priv) return !!(priv->ctrlmode & CAN_CTRLMODE_FD_TDC_MASK); } -/* - * can_get_relative_tdco() - TDCO relative to the sample point - * - * struct can_tdc::tdco represents the absolute offset from TDCV. Some - * controllers use instead an offset relative to the Sample Point (SP) - * such that: - * - * SSP = TDCV + absolute TDCO - * = TDCV + SP + relative TDCO - * - * -+----------- one bit ----------+-- TX pin - * |<--- Sample Point --->| - * - * --+----------- one bit ----------+-- RX pin - * |<-------- TDCV -------->| - * |<------------------------>| absolute TDCO - * |<--- Sample Point --->| - * | |<->| relative TDCO - * |<------------- Secondary Sample Point ------------>| - */ -static inline s32 can_get_relative_tdco(const struct can_priv *priv) -{ - const struct can_bittiming *dbt = &priv->fd.data_bittiming; - s32 sample_point_in_tc = (CAN_SYNC_SEG + dbt->prop_seg + - dbt->phase_seg1) * dbt->brp; - - return (s32)priv->fd.tdc.tdco - sample_point_in_tc; -} - static inline u32 can_get_static_ctrlmode(struct can_priv *priv) { return priv->ctrlmode & ~priv->ctrlmode_supported; } -- cgit v1.2.3 From b23a8425cba5d7908d69f3bce8f3c697362b50ae Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:58:30 +0900 Subject: can: netlink: add can_validate_tdc() Factorise the TDC validation out of can_validate() and move it into the new can_validate_tdc() function. This is a preparation patch for the introduction of CAN XL because this TDC validation will be reused later on.
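For instance, a future CAN XL path could validate its own TDC attributes along these lines (the IFLA_CAN_XL_TDC attribute and the CAN_CTRLMODE_XL_TDC_MASK mask are hypothetical names used only for this sketch):

	err = can_validate_tdc(data[IFLA_CAN_XL_TDC], extack,
			       cm->flags & CAN_CTRLMODE_XL_TDC_MASK);
	if (err)
		return err;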
Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-canxl-netlink-prep-v4-5-e720d28f66fe@kernel.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/netlink.c | 82 +++++++++++++++++++++++++------------------ include/linux/can/bittiming.h | 4 +++ 2 files changed, 52 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c index 13555253e789..25c08adee9ad 100644 --- a/drivers/net/can/dev/netlink.c +++ b/drivers/net/can/dev/netlink.c @@ -57,6 +57,49 @@ static int can_validate_bittiming(struct nlattr *data[], return 0; } +static int can_validate_tdc(struct nlattr *data_tdc, + struct netlink_ext_ack *extack, u32 tdc_flags) +{ + bool tdc_manual = tdc_flags & CAN_CTRLMODE_TDC_MANUAL_MASK; + bool tdc_auto = tdc_flags & CAN_CTRLMODE_TDC_AUTO_MASK; + int err; + + /* CAN_CTRLMODE_TDC_{AUTO,MANUAL} are mutually exclusive */ + if (tdc_auto && tdc_manual) + return -EOPNOTSUPP; + + /* If one of the CAN_CTRLMODE_TDC_* flag is set then TDC + * must be set and vice-versa + */ + if ((tdc_auto || tdc_manual) != !!data_tdc) + return -EOPNOTSUPP; + + /* If providing TDC parameters, at least TDCO is needed. TDCV + * is needed if and only if CAN_CTRLMODE_TDC_MANUAL is set + */ + if (data_tdc) { + struct nlattr *tb_tdc[IFLA_CAN_TDC_MAX + 1]; + + err = nla_parse_nested(tb_tdc, IFLA_CAN_TDC_MAX, + data_tdc, can_tdc_policy, extack); + if (err) + return err; + + if (tb_tdc[IFLA_CAN_TDC_TDCV]) { + if (tdc_auto) + return -EOPNOTSUPP; + } else { + if (tdc_manual) + return -EOPNOTSUPP; + } + + if (!tb_tdc[IFLA_CAN_TDC_TDCO]) + return -EOPNOTSUPP; + } + + return 0; +} + static int can_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { @@ -67,7 +110,7 @@ static int can_validate(struct nlattr *tb[], struct nlattr *data[], * - nominal/arbitration bittiming * - data bittiming * - control mode with CAN_CTRLMODE_FD set - * - TDC parameters are coherent (details below) + * - TDC parameters are coherent (details in can_validate_tdc()) */ if (!data) @@ -75,42 +118,13 @@ static int can_validate(struct nlattr *tb[], struct nlattr *data[], if (data[IFLA_CAN_CTRLMODE]) { struct can_ctrlmode *cm = nla_data(data[IFLA_CAN_CTRLMODE]); - u32 tdc_flags = cm->flags & CAN_CTRLMODE_FD_TDC_MASK; is_can_fd = cm->flags & cm->mask & CAN_CTRLMODE_FD; - /* CAN_CTRLMODE_TDC_{AUTO,MANUAL} are mutually exclusive */ - if (tdc_flags == CAN_CTRLMODE_FD_TDC_MASK) - return -EOPNOTSUPP; - /* If one of the CAN_CTRLMODE_TDC_* flag is set then - * TDC must be set and vice-versa - */ - if (!!tdc_flags != !!data[IFLA_CAN_TDC]) - return -EOPNOTSUPP; - /* If providing TDC parameters, at least TDCO is - * needed. 
TDCV is needed if and only if - * CAN_CTRLMODE_TDC_MANUAL is set - */ - if (data[IFLA_CAN_TDC]) { - struct nlattr *tb_tdc[IFLA_CAN_TDC_MAX + 1]; - - err = nla_parse_nested(tb_tdc, IFLA_CAN_TDC_MAX, - data[IFLA_CAN_TDC], - can_tdc_policy, extack); - if (err) - return err; - - if (tb_tdc[IFLA_CAN_TDC_TDCV]) { - if (tdc_flags & CAN_CTRLMODE_TDC_AUTO) - return -EOPNOTSUPP; - } else { - if (tdc_flags & CAN_CTRLMODE_TDC_MANUAL) - return -EOPNOTSUPP; - } - - if (!tb_tdc[IFLA_CAN_TDC_TDCO]) - return -EOPNOTSUPP; - } + err = can_validate_tdc(data[IFLA_CAN_TDC], extack, + cm->flags & CAN_CTRLMODE_FD_TDC_MASK); + if (err) + return err; } err = can_validate_bittiming(data, extack, IFLA_CAN_BITTIMING); diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index 4d5f7794194a..71f839c3f032 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -16,6 +16,10 @@ #define CAN_CTRLMODE_FD_TDC_MASK \ (CAN_CTRLMODE_TDC_AUTO | CAN_CTRLMODE_TDC_MANUAL) +#define CAN_CTRLMODE_TDC_AUTO_MASK \ + (CAN_CTRLMODE_TDC_AUTO) +#define CAN_CTRLMODE_TDC_MANUAL_MASK \ + (CAN_CTRLMODE_TDC_MANUAL) /* * struct can_tdc - CAN FD Transmission Delay Compensation parameters -- cgit v1.2.3 From 6ffc1230d3a728e07d7d2464f388ad4bbefe90c2 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:58:43 +0900 Subject: can: calc_bittiming: make can_calc_tdco() FD agnostic can_calc_tdco() uses the CAN_CTRLMODE_FD_TDC_MASK and CAN_CTRLMODE_TDC_AUTO macros making it specific to CAN FD. Add the tdc mask to the function parameter list. The value of the tdc auto flag can then be derived from that mask and stored in a local variable. This way, the function becomes CAN FD agnostic and can be reused later on for the CAN XL TDC. Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-canxl-netlink-prep-v4-18-e720d28f66fe@kernel.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/calc_bittiming.c | 10 ++++++---- drivers/net/can/dev/netlink.c | 2 +- include/linux/can/bittiming.h | 4 ++-- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/can/dev/calc_bittiming.c b/drivers/net/can/dev/calc_bittiming.c index a94bd67c670c..394d6974f481 100644 --- a/drivers/net/can/dev/calc_bittiming.c +++ b/drivers/net/can/dev/calc_bittiming.c @@ -173,13 +173,15 @@ int can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, const struct can_bittiming *dbt, - u32 *ctrlmode, u32 ctrlmode_supported) + u32 tdc_mask, u32 *ctrlmode, u32 ctrlmode_supported) { - if (!tdc_const || !(ctrlmode_supported & CAN_CTRLMODE_TDC_AUTO)) + u32 tdc_auto = tdc_mask & CAN_CTRLMODE_TDC_AUTO_MASK; + + if (!tdc_const || !(ctrlmode_supported & tdc_auto)) return; - *ctrlmode &= ~CAN_CTRLMODE_FD_TDC_MASK; + *ctrlmode &= ~tdc_mask; /* As specified in ISO 11898-1 section 11.3.3 "Transmitter * delay compensation" (TDC) is only applicable if data BRP is @@ -193,6 +195,6 @@ void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, if (sample_point_in_tc < tdc_const->tdco_min) return; tdc->tdco = min(sample_point_in_tc, tdc_const->tdco_max); - *ctrlmode |= CAN_CTRLMODE_TDC_AUTO; + *ctrlmode |= tdc_auto; } } diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c index 99038e0fb25f..92d8df13e886 100644 --- a/drivers/net/can/dev/netlink.c +++ b/drivers/net/can/dev/netlink.c @@ -341,7 +341,7 @@ static int can_dbt_changelink(struct 
net_device *dev, struct nlattr *data[], * do calculation */ can_calc_tdco(&dbt_params->tdc, dbt_params->tdc_const, &dbt, - &priv->ctrlmode, priv->ctrlmode_supported); + tdc_mask, &priv->ctrlmode, priv->ctrlmode_supported); } /* else: both CAN_CTRLMODE_TDC_{AUTO,MANUAL} are explicitly * turned off. TDC is disabled: do nothing */ diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index 71f839c3f032..d30816dd93c7 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -135,7 +135,7 @@ int can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, const struct can_bittiming *dbt, - u32 *ctrlmode, u32 ctrlmode_supported); + u32 tdc_mask, u32 *ctrlmode, u32 ctrlmode_supported); #else /* !CONFIG_CAN_CALC_BITTIMING */ static inline int can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, @@ -148,7 +148,7 @@ can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, static inline void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, const struct can_bittiming *dbt, - u32 *ctrlmode, u32 ctrlmode_supported) + u32 tdc_mask, u32 *ctrlmode, u32 ctrlmode_supported) { } #endif /* CONFIG_CAN_CALC_BITTIMING */ -- cgit v1.2.3 From 7de54546fff11cb0a53f47847d62f7b1a5792d17 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:58:44 +0900 Subject: can: dev: add can_get_ctrlmode_str() In an effort to give more human-readable messages when errors occur because of conflicting options, it can be useful to convert the CAN control mode flags into text. Add a function which converts the first set CAN control mode into a human-readable string. The reason to only convert the first one is to simplify edge cases: imagine that there are several invalid control modes; we would just return the first invalid one to the user, thus not having to handle complex string concatenation. The user can then solve the first problem, call the netlink interface again and see the next issue. People who wish to enumerate all the control modes can still do so by, for example, using this new function in a for_each_set_bit() loop.
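For illustration, such a loop could look like the following sketch (assuming a struct can_priv *priv; BITS_PER_TYPE() comes from linux/bits.h):

	unsigned long modes = priv->ctrlmode;
	unsigned int bit;

	for_each_set_bit(bit, &modes, BITS_PER_TYPE(u32))
		netdev_info(dev, "ctrlmode: %s\n", can_get_ctrlmode_str(BIT(bit)));

Each BIT(bit) value has exactly one bit set, so can_get_ctrlmode_str() returns the name of precisely that mode.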
Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-canxl-netlink-prep-v4-19-e720d28f66fe@kernel.org Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/dev.c | 33 +++++++++++++++++++++++++++++++++ include/linux/can/dev.h | 2 ++ 2 files changed, 35 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index befdeb4c54c2..15ccedbb3f8d 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -88,6 +88,39 @@ const char *can_get_state_str(const enum can_state state) } EXPORT_SYMBOL_GPL(can_get_state_str); +const char *can_get_ctrlmode_str(u32 ctrlmode) +{ + switch (ctrlmode & ~(ctrlmode - 1)) { + case 0: + return "none"; + case CAN_CTRLMODE_LOOPBACK: + return "loopback"; + case CAN_CTRLMODE_LISTENONLY: + return "listen-only"; + case CAN_CTRLMODE_3_SAMPLES: + return "triple-sampling"; + case CAN_CTRLMODE_ONE_SHOT: + return "one-shot"; + case CAN_CTRLMODE_BERR_REPORTING: + return "berr-reporting"; + case CAN_CTRLMODE_FD: + return "fd"; + case CAN_CTRLMODE_PRESUME_ACK: + return "presume-ack"; + case CAN_CTRLMODE_FD_NON_ISO: + return "fd-non-iso"; + case CAN_CTRLMODE_CC_LEN8_DLC: + return "cc-len8-dlc"; + case CAN_CTRLMODE_TDC_AUTO: + return "fd-tdc-auto"; + case CAN_CTRLMODE_TDC_MANUAL: + return "fd-tdc-manual"; + default: + return "<unknown>"; + } +} +EXPORT_SYMBOL_GPL(can_get_ctrlmode_str); + static enum can_state can_state_err_to_state(u16 err) { if (err < CAN_ERROR_WARNING_THRESHOLD) diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 8e75e9b3830a..a2229a61ccde 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -141,6 +141,8 @@ int can_restart_now(struct net_device *dev); void can_bus_off(struct net_device *dev); const char *can_get_state_str(const enum can_state state); +const char *can_get_ctrlmode_str(u32 ctrlmode); + void can_state_get_by_berr_counter(const struct net_device *dev, const struct can_berr_counter *bec, enum can_state *tx_state, -- cgit v1.2.3 From 83fb49389bbe07defb85b063f7ff0fd016f06b35 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 18 Sep 2025 10:05:50 +0200 Subject: modpost: Add modname to mod_device_table alias At this point, if a symbol is compiled as part of the kernel, information about which module the symbol belongs to is lost. To save this, it is possible to add the module name to the alias name. It's not very pretty, but it's possible for now. Cc: Miguel Ojeda Cc: Andreas Hindborg Cc: Danilo Krummrich Cc: Alex Gaynor Cc: rust-for-linux@vger.kernel.org Signed-off-by: Alexey Gladkov Acked-by: Danilo Krummrich Acked-by: Nicolas Schier Link: https://patch.msgid.link/1a0d0bd87a4981d465b9ed21e14f4e78eaa03ded.1758182101.git.legion@kernel.org Signed-off-by: Nathan Chancellor --- include/linux/module.h | 14 +++++++++++++- rust/kernel/device_id.rs | 8 ++++---- scripts/mod/file2alias.c | 15 ++++++++++++--- 3 files changed, 29 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 3319a5269d28..e31ee29fac6b 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -244,10 +244,22 @@ struct module_kobject *lookup_or_create_module_kobject(const char *name); /* What your module does. 
*/ #define MODULE_DESCRIPTION(_description) MODULE_INFO(description, _description) +/* + * Format: __mod_device_table__kmod_<modname>__<type>__<name> + * Parts of the string `__kmod_` and `__` are used as delimiters when parsing + * a symbol in file2alias.c + */ +#define __mod_device_table(type, name) \ + __PASTE(__mod_device_table__, \ + __PASTE(__KBUILD_MODNAME, \ + __PASTE(__, \ + __PASTE(type, \ + __PASTE(__, name))))) + #ifdef MODULE /* Creates an alias so file2alias.c can find device table. */ #define MODULE_DEVICE_TABLE(type, name) \ -static typeof(name) __mod_device_table__##type##__##name \ +static typeof(name) __mod_device_table(type, name) \ __attribute__ ((used, alias(__stringify(name)))) #else /* !MODULE */ #define MODULE_DEVICE_TABLE(type, name) #endif diff --git a/rust/kernel/device_id.rs b/rust/kernel/device_id.rs index 70d57814ff79..62c42da12e9d 100644 --- a/rust/kernel/device_id.rs +++ b/rust/kernel/device_id.rs @@ -195,10 +195,10 @@ macro_rules! module_device_table { ($table_type: literal, $module_table_name:ident, $table_name:ident) => { #[rustfmt::skip] #[export_name = - concat!("__mod_device_table__", $table_type, - "__", module_path!(), - "_", line!(), - "_", stringify!($table_name)) + concat!("__mod_device_table__", line!(), + "__kmod_", module_path!(), + "__", $table_type, + "__", stringify!($table_name)) ] static $module_table_name: [::core::mem::MaybeUninit<u8>; $table_name.raw_ids().size()] = unsafe { ::core::mem::transmute_copy($table_name.raw_ids()) }; diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 00586119a25b..1260bc2287fb 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1476,7 +1476,7 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, { void *symval; char *zeros = NULL; - const char *type, *name; + const char *type, *name, *modname; size_t typelen; static const char *prefix = "__mod_device_table__"; @@ -1488,10 +1488,19 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, if (ELF_ST_TYPE(sym->st_info) != STT_OBJECT) return; - /* All our symbols are of form __mod_device_table__<type>__<name>. */ + /* All our symbols are of form __mod_device_table__kmod_<modname>__<type>__<name>. */ if (!strstarts(symname, prefix)) return; - type = symname + strlen(prefix); + + modname = strstr(symname, "__kmod_"); + if (!modname) + return; + modname += strlen("__kmod_"); + + type = strstr(modname, "__"); + if (!type) + return; + type += strlen("__"); name = strstr(type, "__"); if (!name) -- cgit v1.2.3 From 5ab23c7923a1d2ae1890026866a2d8506b010a4a Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 18 Sep 2025 10:05:51 +0200 Subject: modpost: Create modalias for builtin modules For some modules, modalias is generated using the modpost utility and the section is added to the module file. When a module is added inside vmlinux, modpost does not generate modalias for such modules and the information is lost. As a result kmod (which uses modules.builtin.modinfo in userspace) cannot determine that modalias is handled by a builtin kernel module. $ cat /sys/devices/pci0000:00/0000:00:14.0/modalias pci:v00008086d0000A36Dsv00001043sd00008694bc0Csc03i30 $ modinfo xhci_pci name: xhci_pci filename: (builtin) license: GPL file: drivers/usb/host/xhci-pci description: xHCI PCI Host Controller Driver Missing modalias "pci:v*d*sv*sd*bc0Csc03i30*" which will be generated by modpost if the module is built separately. To fix this, it is necessary to generate the same modalias for vmlinux as for the individual modules.
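(As an illustration of the symbol format this builds on: for a hypothetical builtin module "foo", MODULE_DEVICE_TABLE(pci, foo_ids) now roughly expands to

	static typeof(foo_ids) __mod_device_table__kmod_foo__pci__foo_ids
		__attribute__((used, alias("foo_ids")));

so the module name "foo", the table type "pci" and the table symbol "foo_ids" can all be recovered from the symbol name, even when the object is linked into vmlinux.)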
Fortunately, '.vmlinux.export.o' is already generated, from which '.modinfo' can be extracted in the same way as for vmlinux.o. Signed-off-by: Masahiro Yamada Signed-off-by: Alexey Gladkov Tested-by: Stephen Rothwell Reviewed-by: Nicolas Schier Link: https://patch.msgid.link/28d4da3b0e3fc8474142746bcf469e03752c3208.1758182101.git.legion@kernel.org Signed-off-by: Nathan Chancellor --- include/linux/module.h | 4 ---- scripts/Makefile.vmlinux | 4 +++- scripts/mksysmap | 3 +++ scripts/mod/file2alias.c | 19 ++++++++++++++++++- scripts/mod/modpost.c | 15 +++++++++++++++ scripts/mod/modpost.h | 2 ++ 6 files changed, 41 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index e31ee29fac6b..e135cc79acee 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -256,14 +256,10 @@ struct module_kobject *lookup_or_create_module_kobject(const char *name); __PASTE(type, \ __PASTE(__, name))))) -#ifdef MODULE /* Creates an alias so file2alias.c can find device table. */ #define MODULE_DEVICE_TABLE(type, name) \ static typeof(name) __mod_device_table(type, name) \ __attribute__ ((used, alias(__stringify(name)))) -#else /* !MODULE */ -#define MODULE_DEVICE_TABLE(type, name) -#endif /* Version of form [<epoch>:]<version>[-<extra-version>]. * Or for CVS/RCS ID version, everything but the number is stripped. diff --git a/scripts/Makefile.vmlinux b/scripts/Makefile.vmlinux index ce7946171497..1e5e37aadcd0 100644 --- a/scripts/Makefile.vmlinux +++ b/scripts/Makefile.vmlinux @@ -89,11 +89,13 @@ endif remove-section-y := .modinfo remove-section-$(CONFIG_ARCH_VMLINUX_NEEDS_RELOCS) += '.rel*' +remove-symbols := -w --strip-symbol='__mod_device_table__*' + # To avoid warnings: "empty loadable segment detected at ..." from GNU objcopy, # it is necessary to remove the PT_LOAD flag from the segment. quiet_cmd_strip_relocs = OBJCOPY $@ cmd_strip_relocs = $(OBJCOPY) $(patsubst %,--set-section-flags %=noload,$(remove-section-y)) $< $@; \ - $(OBJCOPY) $(addprefix --remove-section=,$(remove-section-y)) $@ + $(OBJCOPY) $(addprefix --remove-section=,$(remove-section-y)) $(remove-symbols) $@ targets += vmlinux vmlinux: vmlinux.unstripped FORCE diff --git a/scripts/mksysmap b/scripts/mksysmap index a607a0059d11..c4531eacde20 100755 --- a/scripts/mksysmap +++ b/scripts/mksysmap @@ -59,6 +59,9 @@ # EXPORT_SYMBOL (namespace) / __kstrtabns_/d +# MODULE_DEVICE_TABLE (symbol name) +/ __mod_device_table__/d + # --------------------------------------------------------------------------- # Ignored suffixes # (do not forget '$' after each pattern) diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 1260bc2287fb..7da9735e7ab3 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1477,7 +1477,7 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, void *symval; char *zeros = NULL; const char *type, *name, *modname; - size_t typelen; + size_t typelen, modnamelen; static const char *prefix = "__mod_device_table__"; /* We're looking for a section relative symbol */ @@ -1500,6 +1500,7 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, type = strstr(modname, "__"); if (!type) return; + modnamelen = type - modname; type += strlen("__"); name = strstr(type, "__"); @@ -1526,5 +1527,21 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, } } + if (mod->is_vmlinux) { + struct module_alias *alias; + + /* + * If this is vmlinux, record the name of the builtin module. 
+ * Traverse the linked list in the reverse order, and set the + * builtin_modname unless it has already been set in the + * previous call. + */ + list_for_each_entry_reverse(alias, &mod->aliases, node) { + if (alias->builtin_modname) + break; + alias->builtin_modname = xstrndup(modname, modnamelen); + } + } + free(zeros); } diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 5ca7c268294e..47c8aa2a6939 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -2067,11 +2067,26 @@ static void write_if_changed(struct buffer *b, const char *fname) static void write_vmlinux_export_c_file(struct module *mod) { struct buffer buf = { }; + struct module_alias *alias, *next; buf_printf(&buf, "#include <linux/export-internal.h>\n"); add_exported_symbols(&buf, mod); + + buf_printf(&buf, + "#include <linux/module.h>\n" + "#undef __MODULE_INFO_PREFIX\n" + "#define __MODULE_INFO_PREFIX\n"); + + list_for_each_entry_safe(alias, next, &mod->aliases, node) { + buf_printf(&buf, "MODULE_INFO(%s.alias, \"%s\");\n", + alias->builtin_modname, alias->str); + list_del(&alias->node); + free(alias->builtin_modname); + free(alias); + } + write_if_changed(&buf, ".vmlinux.export.c"); free(buf.p); } diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 9133e4c3803f..2aecb8f25c87 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -99,10 +99,12 @@ buf_write(struct buffer *buf, const char *s, int len); * struct module_alias - auto-generated MODULE_ALIAS() * * @node: linked to module::aliases + * @builtin_modname: name of the builtin module (only for vmlinux) * @str: a string for MODULE_ALIAS() */ struct module_alias { struct list_head node; + char *builtin_modname; char str[]; }; -- cgit v1.2.3 From 23ef9d439769d5f35353650e771c63d13824235b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 23 Sep 2025 14:34:19 -0700 Subject: kcfi: Rename CONFIG_CFI_CLANG to CONFIG_CFI The kernel's CFI implementation uses the KCFI ABI specifically, and is not strictly tied to a particular compiler. In preparation for GCC supporting KCFI, rename CONFIG_CFI_CLANG to CONFIG_CFI (along with associated options). Use the new "transitional" Kconfig option for old CONFIG_CFI_CLANG that will enable CONFIG_CFI during olddefconfig.
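Conceptually, the instrumentation guarded by CONFIG_CFI works like the following sketch (illustrative pseudo-C only: the type hashes, the exact offset of the hash before the function entry, and the trap mechanism are emitted by the compiler and differ per architecture):

	/* The compiler stores a 32-bit hash of the function's static type
	 * just before each address-taken function; every indirect call
	 * site checks it before jumping: */
	if (*(const u32 *)((const char *)fn - 4) != expected_type_hash)
		cfi_trap();	/* panic, or only a warning with CFI_PERMISSIVE */
	fn(arg);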
Reviewed-by: Linus Walleij Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20250923213422.1105654-3-kees@kernel.org Signed-off-by: Kees Cook --- Makefile | 2 +- arch/Kconfig | 36 +++++++++++++++++++++------------ arch/arm/Kconfig | 2 +- arch/arm/kernel/hw_breakpoint.c | 2 +- arch/arm/mm/Makefile | 2 +- arch/arm/mm/cache-fa.S | 2 +- arch/arm/mm/cache-v4.S | 2 +- arch/arm/mm/cache-v4wb.S | 4 ++-- arch/arm/mm/cache-v4wt.S | 2 +- arch/arm/mm/cache-v6.S | 2 +- arch/arm/mm/cache-v7.S | 2 +- arch/arm/mm/cache-v7m.S | 2 +- arch/arm/mm/proc-arm1020.S | 2 +- arch/arm/mm/proc-arm1020e.S | 2 +- arch/arm/mm/proc-arm1022.S | 2 +- arch/arm/mm/proc-arm1026.S | 2 +- arch/arm/mm/proc-arm920.S | 2 +- arch/arm/mm/proc-arm922.S | 2 +- arch/arm/mm/proc-arm925.S | 2 +- arch/arm/mm/proc-arm926.S | 2 +- arch/arm/mm/proc-arm940.S | 2 +- arch/arm/mm/proc-arm946.S | 2 +- arch/arm/mm/proc-feroceon.S | 2 +- arch/arm/mm/proc-mohawk.S | 2 +- arch/arm/mm/proc-xsc3.S | 2 +- arch/arm/mm/tlb-v4.S | 2 +- arch/arm64/Kconfig | 4 ++-- arch/arm64/kernel/debug-monitors.c | 2 +- arch/arm64/kernel/traps.c | 4 ++-- arch/arm64/kvm/handle_exit.c | 2 +- arch/arm64/net/bpf_jit_comp.c | 2 +- arch/riscv/Kconfig | 6 +++--- arch/riscv/include/asm/cfi.h | 4 ++-- arch/riscv/kernel/Makefile | 2 +- arch/riscv/net/bpf_jit_comp64.c | 4 ++-- arch/riscv/purgatory/Makefile | 2 +- arch/x86/Kconfig | 12 +++++------ arch/x86/include/asm/cfi.h | 4 ++-- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/alternative.c | 4 ++-- arch/x86/kernel/kprobes/core.c | 2 +- arch/x86/purgatory/Makefile | 2 +- drivers/misc/lkdtm/cfi.c | 2 +- include/asm-generic/vmlinux.lds.h | 2 +- include/linux/cfi.h | 6 +++--- include/linux/cfi_types.h | 8 ++++---- include/linux/compiler.h | 2 +- init/Kconfig | 4 ++-- kernel/Makefile | 2 +- kernel/configs/hardening.config | 4 ++-- kernel/module/Kconfig | 2 +- kernel/module/tree_lookup.c | 2 +- lib/Kconfig.debug | 2 +- tools/include/linux/cfi_types.h | 6 +++--- tools/perf/util/include/linux/linkage.h | 2 +- 55 files changed, 100 insertions(+), 90 deletions(-) (limited to 'include/linux') diff --git a/Makefile b/Makefile index d1adb78c3596..437989d6e0be 100644 --- a/Makefile +++ b/Makefile @@ -1020,7 +1020,7 @@ KBUILD_AFLAGS += -fno-lto export CC_FLAGS_LTO endif -ifdef CONFIG_CFI_CLANG +ifdef CONFIG_CFI CC_FLAGS_CFI := -fsanitize=kcfi ifdef CONFIG_CFI_ICALL_NORMALIZE_INTEGERS CC_FLAGS_CFI += -fsanitize-cfi-icall-experimental-normalize-integers diff --git a/arch/Kconfig b/arch/Kconfig index d1b4ffd6e085..97642c08a124 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -867,22 +867,26 @@ config PROPELLER_CLANG If unsure, say N. -config ARCH_SUPPORTS_CFI_CLANG +config ARCH_SUPPORTS_CFI bool help - An architecture should select this option if it can support Clang's - Control-Flow Integrity (CFI) checking. + An architecture should select this option if it can support Kernel + Control-Flow Integrity (CFI) checking (-fsanitize=kcfi). config ARCH_USES_CFI_TRAPS bool + help + An architecture should select this option if it requires the + .kcfi_traps section for KCFI trap handling. 
-config CFI_CLANG - bool "Use Clang's Control Flow Integrity (CFI)" - depends on ARCH_SUPPORTS_CFI_CLANG +config CFI + bool "Use Kernel Control Flow Integrity (kCFI)" + default CFI_CLANG + depends on ARCH_SUPPORTS_CFI depends on $(cc-option,-fsanitize=kcfi) help - This option enables Clang's forward-edge Control Flow Integrity - (CFI) checking, where the compiler injects a runtime check to each + This option enables forward-edge Control Flow Integrity (CFI) + checking, where the compiler injects a runtime check to each indirect function call to ensure the target is a valid function with the correct static type. This restricts possible call targets and makes it more difficult for an attacker to exploit bugs that allow @@ -891,10 +895,16 @@ config CFI_CLANG https://clang.llvm.org/docs/ControlFlowIntegrity.html +config CFI_CLANG + bool + transitional + help + Transitional config for CFI_CLANG to CFI migration. + config CFI_ICALL_NORMALIZE_INTEGERS bool "Normalize CFI tags for integers" - depends on CFI_CLANG - depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG + depends on CFI + depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS help This option normalizes the CFI tags for integer types so that all integer types of the same size and signedness receive the same CFI @@ -907,7 +917,7 @@ config CFI_ICALL_NORMALIZE_INTEGERS This option is necessary for using CFI with Rust. If unsure, say N. -config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG +config HAVE_CFI_ICALL_NORMALIZE_INTEGERS def_bool y depends on $(cc-option,-fsanitize=kcfi -fsanitize-cfi-icall-experimental-normalize-integers) # With GCOV/KASAN we need this fix: https://github.com/llvm/llvm-project/pull/104826 @@ -915,7 +925,7 @@ config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC def_bool y - depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS_CLANG + depends on HAVE_CFI_ICALL_NORMALIZE_INTEGERS depends on RUSTC_VERSION >= 107900 # With GCOV/KASAN we need this fix: https://github.com/rust-lang/rust/pull/129373 depends on (RUSTC_LLVM_VERSION >= 190103 && RUSTC_VERSION >= 108200) || \ @@ -923,7 +933,7 @@ config HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC config CFI_PERMISSIVE bool "Use CFI in permissive mode" - depends on CFI_CLANG + depends on CFI help When selected, Control Flow Integrity (CFI) violations result in a warning instead of a kernel panic. 
This option should only be used diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index b1f3df39ed40..36ab8625be72 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -38,7 +38,7 @@ config ARM select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_NEED_CMPXCHG_1_EMU if CPU_V6 select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_CFI_CLANG + select ARCH_SUPPORTS_CFI select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_USE_BUILTIN_BSWAP diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index a12efd0f43e8..cd4b34c96e35 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -904,7 +904,7 @@ unlock: watchpoint_single_step_handler(addr); } -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI static void hw_breakpoint_cfi_handler(struct pt_regs *regs) { /* diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile index a195cd1d3e6d..1e2201013371 100644 --- a/arch/arm/mm/Makefile +++ b/arch/arm/mm/Makefile @@ -89,7 +89,7 @@ obj-$(CONFIG_CPU_V6) += proc-v6.o obj-$(CONFIG_CPU_V6K) += proc-v6.o obj-$(CONFIG_CPU_V7) += proc-v7.o proc-v7-bugs.o obj-$(CONFIG_CPU_V7M) += proc-v7m.o -obj-$(CONFIG_CFI_CLANG) += proc.o +obj-$(CONFIG_CFI) += proc.o obj-$(CONFIG_OUTER_CACHE) += l2c-common.o obj-$(CONFIG_CACHE_B15_RAC) += cache-b15-rac.o diff --git a/arch/arm/mm/cache-fa.S b/arch/arm/mm/cache-fa.S index 4a3668b52a2d..e1641799569b 100644 --- a/arch/arm/mm/cache-fa.S +++ b/arch/arm/mm/cache-fa.S @@ -112,7 +112,7 @@ SYM_FUNC_END(fa_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(fa_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b fa_coherent_user_range #endif SYM_FUNC_END(fa_coherent_kern_range) diff --git a/arch/arm/mm/cache-v4.S b/arch/arm/mm/cache-v4.S index 0e94e5193dbd..001d7042bd46 100644 --- a/arch/arm/mm/cache-v4.S +++ b/arch/arm/mm/cache-v4.S @@ -104,7 +104,7 @@ SYM_FUNC_END(v4_coherent_user_range) * - size - region size */ SYM_TYPED_FUNC_START(v4_flush_kern_dcache_area) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v4_dma_flush_range #endif SYM_FUNC_END(v4_flush_kern_dcache_area) diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S index ce55a2eef5da..874fe5310f9a 100644 --- a/arch/arm/mm/cache-v4wb.S +++ b/arch/arm/mm/cache-v4wb.S @@ -136,7 +136,7 @@ SYM_FUNC_END(v4wb_flush_user_cache_range) */ SYM_TYPED_FUNC_START(v4wb_flush_kern_dcache_area) add r1, r0, r1 -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v4wb_coherent_user_range #endif SYM_FUNC_END(v4wb_flush_kern_dcache_area) @@ -152,7 +152,7 @@ SYM_FUNC_END(v4wb_flush_kern_dcache_area) * - end - virtual end address */ SYM_TYPED_FUNC_START(v4wb_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v4wb_coherent_user_range #endif SYM_FUNC_END(v4wb_coherent_kern_range) diff --git a/arch/arm/mm/cache-v4wt.S b/arch/arm/mm/cache-v4wt.S index a97dc267b3b0..2ee62e4b2b07 100644 --- a/arch/arm/mm/cache-v4wt.S +++ b/arch/arm/mm/cache-v4wt.S @@ -108,7 +108,7 @@ SYM_FUNC_END(v4wt_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(v4wt_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v4wt_coherent_user_range #endif SYM_FUNC_END(v4wt_coherent_kern_range) diff --git a/arch/arm/mm/cache-v6.S 
b/arch/arm/mm/cache-v6.S index 9f415476e218..5ceea8965ea1 100644 --- a/arch/arm/mm/cache-v6.S +++ b/arch/arm/mm/cache-v6.S @@ -117,7 +117,7 @@ SYM_FUNC_END(v6_flush_user_cache_range) * - the Icache does not read data from the write buffer */ SYM_TYPED_FUNC_START(v6_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v6_coherent_user_range #endif SYM_FUNC_END(v6_coherent_kern_range) diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S index 201ca05436fa..726681fb7d4d 100644 --- a/arch/arm/mm/cache-v7.S +++ b/arch/arm/mm/cache-v7.S @@ -261,7 +261,7 @@ SYM_FUNC_END(v7_flush_user_cache_range) * - the Icache does not read data from the write buffer */ SYM_TYPED_FUNC_START(v7_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v7_coherent_user_range #endif SYM_FUNC_END(v7_coherent_kern_range) diff --git a/arch/arm/mm/cache-v7m.S b/arch/arm/mm/cache-v7m.S index 14d719eba729..7f9cfad2ea21 100644 --- a/arch/arm/mm/cache-v7m.S +++ b/arch/arm/mm/cache-v7m.S @@ -286,7 +286,7 @@ SYM_FUNC_END(v7m_flush_user_cache_range) * - the Icache does not read data from the write buffer */ SYM_TYPED_FUNC_START(v7m_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b v7m_coherent_user_range #endif SYM_FUNC_END(v7m_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm1020.S b/arch/arm/mm/proc-arm1020.S index d0ce3414a13e..4612a4961e81 100644 --- a/arch/arm/mm/proc-arm1020.S +++ b/arch/arm/mm/proc-arm1020.S @@ -203,7 +203,7 @@ SYM_FUNC_END(arm1020_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm1020_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm1020_coherent_user_range #endif SYM_FUNC_END(arm1020_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm1020e.S b/arch/arm/mm/proc-arm1020e.S index 64f031bf6eff..b4a8a3a8eda3 100644 --- a/arch/arm/mm/proc-arm1020e.S +++ b/arch/arm/mm/proc-arm1020e.S @@ -200,7 +200,7 @@ SYM_FUNC_END(arm1020e_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm1020e_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm1020e_coherent_user_range #endif SYM_FUNC_END(arm1020e_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm1022.S b/arch/arm/mm/proc-arm1022.S index 42ed5ed07252..709870e99e19 100644 --- a/arch/arm/mm/proc-arm1022.S +++ b/arch/arm/mm/proc-arm1022.S @@ -199,7 +199,7 @@ SYM_FUNC_END(arm1022_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm1022_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm1022_coherent_user_range #endif SYM_FUNC_END(arm1022_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm1026.S b/arch/arm/mm/proc-arm1026.S index b3ae62cd553a..02f7370a8c5c 100644 --- a/arch/arm/mm/proc-arm1026.S +++ b/arch/arm/mm/proc-arm1026.S @@ -194,7 +194,7 @@ SYM_FUNC_END(arm1026_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm1026_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm1026_coherent_user_range #endif SYM_FUNC_END(arm1026_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm920.S b/arch/arm/mm/proc-arm920.S index a30df54ad5fa..4727f4b5b6e8 100644 --- 
a/arch/arm/mm/proc-arm920.S +++ b/arch/arm/mm/proc-arm920.S @@ -180,7 +180,7 @@ SYM_FUNC_END(arm920_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm920_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm920_coherent_user_range #endif SYM_FUNC_END(arm920_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm922.S b/arch/arm/mm/proc-arm922.S index aac4e048100d..5a4a3f4f2683 100644 --- a/arch/arm/mm/proc-arm922.S +++ b/arch/arm/mm/proc-arm922.S @@ -182,7 +182,7 @@ SYM_FUNC_END(arm922_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm922_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm922_coherent_user_range #endif SYM_FUNC_END(arm922_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm925.S b/arch/arm/mm/proc-arm925.S index 035941faeb2e..1c4830afe1d3 100644 --- a/arch/arm/mm/proc-arm925.S +++ b/arch/arm/mm/proc-arm925.S @@ -229,7 +229,7 @@ SYM_FUNC_END(arm925_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm925_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm925_coherent_user_range #endif SYM_FUNC_END(arm925_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm926.S b/arch/arm/mm/proc-arm926.S index 6f43d6af2d9a..a09cc3e02efd 100644 --- a/arch/arm/mm/proc-arm926.S +++ b/arch/arm/mm/proc-arm926.S @@ -192,7 +192,7 @@ SYM_FUNC_END(arm926_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm926_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm926_coherent_user_range #endif SYM_FUNC_END(arm926_coherent_kern_range) diff --git a/arch/arm/mm/proc-arm940.S b/arch/arm/mm/proc-arm940.S index 0d30bb25c42b..545c076c36d2 100644 --- a/arch/arm/mm/proc-arm940.S +++ b/arch/arm/mm/proc-arm940.S @@ -153,7 +153,7 @@ SYM_FUNC_END(arm940_coherent_kern_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm940_coherent_user_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm940_flush_kern_dcache_area #endif SYM_FUNC_END(arm940_coherent_user_range) diff --git a/arch/arm/mm/proc-arm946.S b/arch/arm/mm/proc-arm946.S index 27750ace2ced..f3d4e18c3fba 100644 --- a/arch/arm/mm/proc-arm946.S +++ b/arch/arm/mm/proc-arm946.S @@ -173,7 +173,7 @@ SYM_FUNC_END(arm946_flush_user_cache_range) * - end - virtual end address */ SYM_TYPED_FUNC_START(arm946_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b arm946_coherent_user_range #endif SYM_FUNC_END(arm946_coherent_kern_range) diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S index f67b2ffac854..7f08d06c9625 100644 --- a/arch/arm/mm/proc-feroceon.S +++ b/arch/arm/mm/proc-feroceon.S @@ -208,7 +208,7 @@ SYM_FUNC_END(feroceon_flush_user_cache_range) */ .align 5 SYM_TYPED_FUNC_START(feroceon_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b feroceon_coherent_user_range #endif SYM_FUNC_END(feroceon_coherent_kern_range) diff --git a/arch/arm/mm/proc-mohawk.S b/arch/arm/mm/proc-mohawk.S index 8e9f38da863a..4669c63e3121 100644 --- a/arch/arm/mm/proc-mohawk.S +++ b/arch/arm/mm/proc-mohawk.S @@ -163,7 +163,7 @@ SYM_FUNC_END(mohawk_flush_user_cache_range) * - 
end - virtual end address */ SYM_TYPED_FUNC_START(mohawk_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b mohawk_coherent_user_range #endif SYM_FUNC_END(mohawk_coherent_kern_range) diff --git a/arch/arm/mm/proc-xsc3.S b/arch/arm/mm/proc-xsc3.S index 14927b380452..fd25634a2ed5 100644 --- a/arch/arm/mm/proc-xsc3.S +++ b/arch/arm/mm/proc-xsc3.S @@ -223,7 +223,7 @@ SYM_FUNC_END(xsc3_flush_user_cache_range) * it also trashes the mini I-cache used by JTAG debuggers. */ SYM_TYPED_FUNC_START(xsc3_coherent_kern_range) -#ifdef CONFIG_CFI_CLANG /* Fallthrough if !CFI */ +#ifdef CONFIG_CFI /* Fallthrough if !CFI */ b xsc3_coherent_user_range #endif SYM_FUNC_END(xsc3_coherent_kern_range) diff --git a/arch/arm/mm/tlb-v4.S b/arch/arm/mm/tlb-v4.S index 09ff69008d94..079774a02be6 100644 --- a/arch/arm/mm/tlb-v4.S +++ b/arch/arm/mm/tlb-v4.S @@ -52,7 +52,7 @@ SYM_FUNC_END(v4_flush_user_tlb_range) * - start - virtual address (may not be aligned) * - end - virtual address (may not be aligned) */ -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI SYM_TYPED_FUNC_START(v4_flush_kern_tlb_range) b .v4_flush_kern_tlb_range SYM_FUNC_END(v4_flush_kern_tlb_range) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a6..1e38b8885a46 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -100,7 +100,7 @@ config ARM64 select ARCH_SUPPORTS_SHADOW_CALL_STACK if CC_HAVE_SHADOW_CALL_STACK select ARCH_SUPPORTS_LTO_CLANG if CPU_LITTLE_ENDIAN select ARCH_SUPPORTS_LTO_CLANG_THIN - select ARCH_SUPPORTS_CFI_CLANG + select ARCH_SUPPORTS_CFI select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_NUMA_BALANCING @@ -212,7 +212,7 @@ config ARM64 select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS \ if DYNAMIC_FTRACE_WITH_ARGS && DYNAMIC_FTRACE_WITH_CALL_OPS select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS \ - if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG && \ + if (DYNAMIC_FTRACE_WITH_ARGS && !CFI && \ (CC_IS_CLANG || !CC_OPTIMIZE_FOR_SIZE)) select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \ if DYNAMIC_FTRACE_WITH_ARGS diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 110d9ff54174..ebf010443e22 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -212,7 +212,7 @@ static int call_el1_break_hook(struct pt_regs *regs, unsigned long esr) if (esr_brk_comment(esr) == BUG_BRK_IMM) return bug_brk_handler(regs, esr); - if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) + if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr)) return cfi_brk_handler(regs, esr); if (esr_brk_comment(esr) == FAULT_BRK_IMM) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index f528b6041f6a..5041817af267 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -1015,7 +1015,7 @@ int bug_brk_handler(struct pt_regs *regs, unsigned long esr) return DBG_HOOK_HANDLED; } -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI int cfi_brk_handler(struct pt_regs *regs, unsigned long esr) { unsigned long target; @@ -1039,7 +1039,7 @@ int cfi_brk_handler(struct pt_regs *regs, unsigned long esr) arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); return DBG_HOOK_HANDLED; } -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ int reserved_fault_brk_handler(struct pt_regs *regs, unsigned long esr) { diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index a598072f36d2..8bdb1eed090a 100644 --- a/arch/arm64/kvm/handle_exit.c +++ 
b/arch/arm64/kvm/handle_exit.c @@ -545,7 +545,7 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); else print_nvhe_hyp_panic("BUG", panic_addr); - } else if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) { + } else if (IS_ENABLED(CONFIG_CFI) && esr_is_cfi_brk(esr)) { kvm_nvhe_report_cfi_failure(panic_addr); } else if (IS_ENABLED(CONFIG_UBSAN_KVM_EL2) && ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 52ffe115a8c4..28996e0a9b00 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -185,7 +185,7 @@ static inline void emit_bti(u32 insn, struct jit_ctx *ctx) static inline void emit_kcfi(u32 hash, struct jit_ctx *ctx) { - if (IS_ENABLED(CONFIG_CFI_CLANG)) + if (IS_ENABLED(CONFIG_CFI)) emit_u32_data(hash, ctx); } diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a4b233a0659e..6043ad82b73c 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -60,7 +60,7 @@ config RISCV select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW # clang >= 17: https://github.com/llvm/llvm-project/commit/62fa708ceb027713b386c7e0efda994f8bdc27e2 - select ARCH_SUPPORTS_CFI_CLANG if CLANG_VERSION >= 170000 + select ARCH_SUPPORTS_CFI if (!CC_IS_CLANG || CLANG_VERSION >= 170000) select ARCH_SUPPORTS_DEBUG_PAGEALLOC if MMU select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_HUGETLBFS if MMU @@ -76,7 +76,7 @@ config RISCV select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_SYM_ANNOTATIONS - select ARCH_USES_CFI_TRAPS if CFI_CLANG + select ARCH_USES_CFI_TRAPS if CFI select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if MMU select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS @@ -154,7 +154,7 @@ config RISCV select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE) select FUNCTION_ALIGNMENT_4B if HAVE_DYNAMIC_FTRACE && RISCV_ISA_C select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS - select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG) + select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI) select HAVE_DYNAMIC_FTRACE_WITH_ARGS if HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_GRAPH_FUNC select HAVE_FUNCTION_GRAPH_TRACER if HAVE_DYNAMIC_FTRACE_WITH_ARGS diff --git a/arch/riscv/include/asm/cfi.h b/arch/riscv/include/asm/cfi.h index 4508aaa7a2fd..710aa8192edd 100644 --- a/arch/riscv/include/asm/cfi.h +++ b/arch/riscv/include/asm/cfi.h @@ -11,7 +11,7 @@ struct pt_regs; -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI enum bug_trap_type handle_cfi_failure(struct pt_regs *regs); #define __bpfcall #else @@ -19,6 +19,6 @@ static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) { return BUG_TRAP_TYPE_NONE; } -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #endif /* _ASM_RISCV_CFI_H */ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index c7b542573407..f60fce69b725 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -113,7 +113,7 @@ obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o -obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CFI) += cfi.o obj-$(CONFIG_EFI) += efi.o obj-$(CONFIG_COMPAT) += compat_syscall_table.o diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 10e01ff06312..24ba546a1c0e 100644 --- 
a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -18,7 +18,7 @@ #define RV_MAX_REG_ARGS 8 #define RV_FENTRY_NINSNS 2 #define RV_FENTRY_NBYTES (RV_FENTRY_NINSNS * 4) -#define RV_KCFI_NINSNS (IS_ENABLED(CONFIG_CFI_CLANG) ? 1 : 0) +#define RV_KCFI_NINSNS (IS_ENABLED(CONFIG_CFI) ? 1 : 0) /* imm that allows emit_imm to emit max count insns */ #define RV_MAX_COUNT_IMM 0x7FFF7FF7FF7FF7FF @@ -469,7 +469,7 @@ static int emit_call(u64 addr, bool fixed_addr, struct rv_jit_context *ctx) static inline void emit_kcfi(u32 hash, struct rv_jit_context *ctx) { - if (IS_ENABLED(CONFIG_CFI_CLANG)) + if (IS_ENABLED(CONFIG_CFI)) emit(hash, ctx); } diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile index 240592e3f5c2..530e497ca2f9 100644 --- a/arch/riscv/purgatory/Makefile +++ b/arch/riscv/purgatory/Makefile @@ -71,7 +71,7 @@ ifdef CONFIG_STACKPROTECTOR_STRONG PURGATORY_CFLAGS_REMOVE += -fstack-protector-strong endif -ifdef CONFIG_CFI_CLANG +ifdef CONFIG_CFI PURGATORY_CFLAGS_REMOVE += $(CC_FLAGS_CFI) endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58d890fe2100..b6da2d37cfd1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -127,8 +127,8 @@ config X86 select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 - select ARCH_SUPPORTS_CFI_CLANG if X86_64 - select ARCH_USES_CFI_TRAPS if X86_64 && CFI_CLANG + select ARCH_SUPPORTS_CFI if X86_64 + select ARCH_USES_CFI_TRAPS if X86_64 && CFI select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG_THIN select ARCH_SUPPORTS_RT @@ -2396,11 +2396,11 @@ config FUNCTION_PADDING_CFI default 3 if FUNCTION_ALIGNMENT_8B default 0 -# Basically: FUNCTION_ALIGNMENT - 5*CFI_CLANG +# Basically: FUNCTION_ALIGNMENT - 5*CFI # except Kconfig can't do arithmetic :/ config FUNCTION_PADDING_BYTES int - default FUNCTION_PADDING_CFI if CFI_CLANG + default FUNCTION_PADDING_CFI if CFI default FUNCTION_ALIGNMENT config CALL_PADDING @@ -2410,7 +2410,7 @@ config CALL_PADDING config FINEIBT def_bool y - depends on X86_KERNEL_IBT && CFI_CLANG && MITIGATION_RETPOLINE + depends on X86_KERNEL_IBT && CFI && MITIGATION_RETPOLINE select CALL_PADDING config FINEIBT_BHI @@ -2427,7 +2427,7 @@ config CALL_THUNKS config PREFIX_SYMBOLS def_bool y - depends on CALL_PADDING && !CFI_CLANG + depends on CALL_PADDING && !CFI menuconfig CPU_MITIGATIONS bool "Mitigations for CPU vulnerabilities" diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h index 1751f1eb95ef..976b90a3d190 100644 --- a/arch/x86/include/asm/cfi.h +++ b/arch/x86/include/asm/cfi.h @@ -113,7 +113,7 @@ extern bhi_thunk __bhi_args_end[]; struct pt_regs; -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI enum bug_trap_type handle_cfi_failure(struct pt_regs *regs); #define __bpfcall @@ -157,7 +157,7 @@ static inline int cfi_get_func_arity(void *func) { return 0; } -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #if HAS_KERNEL_IBT == 1 #define CFI_NOSEAL(x) asm(IBT_NOSEAL(__stringify(x))) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0d2a6d953be9..bc184dd38d99 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -148,7 +148,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o -obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CFI) += cfi.o obj-$(CONFIG_CALL_THUNKS) += callthunks.o diff --git a/arch/x86/kernel/alternative.c 
b/arch/x86/kernel/alternative.c index 7bde68247b5f..79ae9cb50019 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1170,7 +1170,7 @@ void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } #ifdef CONFIG_CFI_AUTO_DEFAULT # define __CFI_DEFAULT CFI_AUTO -#elif defined(CONFIG_CFI_CLANG) +#elif defined(CONFIG_CFI) # define __CFI_DEFAULT CFI_KCFI #else # define __CFI_DEFAULT CFI_OFF @@ -1182,7 +1182,7 @@ enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; bool cfi_bhi __ro_after_init = false; #endif -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI u32 cfi_get_func_hash(void *func) { u32 hash; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 6079d15dab8c..3863d7709386 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -339,7 +339,7 @@ static bool can_probe(unsigned long paddr) if (is_exception_insn(&insn)) return false; - if (IS_ENABLED(CONFIG_CFI_CLANG)) { + if (IS_ENABLED(CONFIG_CFI)) { /* * The compiler generates the following instruction sequence * for indirect call checks and cfi.c decodes this; diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index e0a607a14e7e..5ce1d4263000 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -57,7 +57,7 @@ ifdef CONFIG_MITIGATION_RETPOLINE PURGATORY_CFLAGS_REMOVE += $(RETPOLINE_CFLAGS) endif -ifdef CONFIG_CFI_CLANG +ifdef CONFIG_CFI PURGATORY_CFLAGS_REMOVE += $(CC_FLAGS_CFI) endif diff --git a/drivers/misc/lkdtm/cfi.c b/drivers/misc/lkdtm/cfi.c index 6a33889d0902..c3971f7caa65 100644 --- a/drivers/misc/lkdtm/cfi.c +++ b/drivers/misc/lkdtm/cfi.c @@ -43,7 +43,7 @@ static void lkdtm_CFI_FORWARD_PROTO(void) lkdtm_indirect_call((void *)lkdtm_increment_int); pr_err("FAIL: survived mismatched prototype function call!\n"); - pr_expected_config(CONFIG_CFI_CLANG); + pr_expected_config(CONFIG_CFI); } /* diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index ae2d2359b79e..a65a87366c48 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -157,7 +157,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) #define PATCHABLE_DISCARDS *(__patchable_function_entries) #endif -#ifndef CONFIG_ARCH_SUPPORTS_CFI_CLANG +#ifndef CONFIG_ARCH_SUPPORTS_CFI /* * Simply points to ftrace_stub, but with the proper protocol. 
* Defined by the linker script in linux/vmlinux.lds.h diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 52a98886a455..1fd22ea6eba4 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -11,7 +11,7 @@ #include #include -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI extern bool cfi_warn; enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, @@ -52,7 +52,7 @@ static inline u32 cfi_get_func_hash(void *func) extern u32 cfi_bpf_hash; extern u32 cfi_bpf_subprog_hash; -#else /* CONFIG_CFI_CLANG */ +#else /* CONFIG_CFI */ static inline int cfi_get_offset(void) { return 0; } static inline u32 cfi_get_func_hash(void *func) { return 0; } @@ -60,7 +60,7 @@ static inline u32 cfi_get_func_hash(void *func) { return 0; } #define cfi_bpf_hash 0U #define cfi_bpf_subprog_hash 0U -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #ifdef CONFIG_ARCH_USES_CFI_TRAPS bool is_cfi_trap(unsigned long addr); diff --git a/include/linux/cfi_types.h b/include/linux/cfi_types.h index 685f7181780f..a86af9bc8bdc 100644 --- a/include/linux/cfi_types.h +++ b/include/linux/cfi_types.h @@ -8,7 +8,7 @@ #ifdef __ASSEMBLY__ #include -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI /* * Use the __kcfi_typeid_ type identifier symbol to * annotate indirectly called assembly functions. The compiler emits @@ -29,12 +29,12 @@ #define SYM_TYPED_START(name, linkage, align...) \ SYM_TYPED_ENTRY(name, linkage, align) -#else /* CONFIG_CFI_CLANG */ +#else /* CONFIG_CFI */ #define SYM_TYPED_START(name, linkage, align...) \ SYM_START(name, linkage, align) -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #ifndef SYM_TYPED_FUNC_START #define SYM_TYPED_FUNC_START(name) \ @@ -43,7 +43,7 @@ #else /* __ASSEMBLY__ */ -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI #define DEFINE_CFI_TYPE(name, func) \ /* \ * Force a reference to the function so the compiler generates \ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 6f04a1d8c720..fb27da2221ee 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -248,7 +248,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* __KERNEL__ */ -#if defined(CONFIG_CFI_CLANG) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) +#if defined(CONFIG_CFI) && !defined(__DISABLE_EXPORTS) && !defined(BUILD_VDSO) /* * Force a reference to the external symbol so the compiler generates * __kcfi_typid. 
diff --git a/init/Kconfig b/init/Kconfig index 836320251219..67f10d8a33b7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2063,8 +2063,8 @@ config RUST depends on !GCC_PLUGIN_RANDSTRUCT depends on !RANDSTRUCT depends on !DEBUG_INFO_BTF || (PAHOLE_HAS_LANG_EXCLUDE && !LTO) - depends on !CFI_CLANG || HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC - select CFI_ICALL_NORMALIZE_INTEGERS if CFI_CLANG + depends on !CFI || HAVE_CFI_ICALL_NORMALIZE_INTEGERS_RUSTC + select CFI_ICALL_NORMALIZE_INTEGERS if CFI depends on !CALL_PADDING || RUSTC_VERSION >= 108100 depends on !KASAN_SW_TAGS depends on !(MITIGATION_RETHUNK && KASAN) || RUSTC_VERSION >= 108300 diff --git a/kernel/Makefile b/kernel/Makefile index c60623448235..27e0e6a33610 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -122,7 +122,7 @@ obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o -obj-$(CONFIG_CFI_CLANG) += cfi.o +obj-$(CONFIG_CFI) += cfi.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index 64caaf997fc0..7c3924614e01 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -93,8 +93,8 @@ CONFIG_SECCOMP_FILTER=y # Provides some protections against SYN flooding. CONFIG_SYN_COOKIES=y -# Enable Kernel Control Flow Integrity (currently Clang only). -CONFIG_CFI_CLANG=y +# Enable Kernel Control Flow Integrity. +CONFIG_CFI=y # CONFIG_CFI_PERMISSIVE is not set # Attack surface reduction: do not autoload TTY line disciplines. diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 39278737bb68..2a1beebf1d37 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -460,6 +460,6 @@ config UNUSED_KSYMS_WHITELIST config MODULES_TREE_LOOKUP def_bool y - depends on PERF_EVENTS || TRACING || CFI_CLANG + depends on PERF_EVENTS || TRACING || CFI endif # MODULES diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c index d3204c5c74eb..f8e8c126705c 100644 --- a/kernel/module/tree_lookup.c +++ b/kernel/module/tree_lookup.c @@ -14,7 +14,7 @@ * Use a latched RB-tree for __module_address(); this allows us to use * RCU lookups of the address from any context. * - * This is conditional on PERF_EVENTS || TRACING || CFI_CLANG because those can + * This is conditional on PERF_EVENTS || TRACING || CFI because those can * really hit __module_address() hard by doing a lot of stack unwinding; * potentially from NMI context. */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index dc0e0c6ed075..e3e69df19e78 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2894,7 +2894,7 @@ config FORTIFY_KUNIT_TEST config LONGEST_SYM_KUNIT_TEST tristate "Test the longest symbol possible" if !KUNIT_ALL_TESTS depends on KUNIT && KPROBES - depends on !PREFIX_SYMBOLS && !CFI_CLANG && !GCOV_KERNEL + depends on !PREFIX_SYMBOLS && !CFI && !GCOV_KERNEL default KUNIT_ALL_TESTS help Tests the longest symbol possible diff --git a/tools/include/linux/cfi_types.h b/tools/include/linux/cfi_types.h index 6b8713675765..2e098274e45c 100644 --- a/tools/include/linux/cfi_types.h +++ b/tools/include/linux/cfi_types.h @@ -8,7 +8,7 @@ #ifdef __ASSEMBLY__ #include -#ifdef CONFIG_CFI_CLANG +#ifdef CONFIG_CFI /* * Use the __kcfi_typeid_ type identifier symbol to * annotate indirectly called assembly functions. The compiler emits @@ -29,12 +29,12 @@ #define SYM_TYPED_START(name, linkage, align...) 
\ SYM_TYPED_ENTRY(name, linkage, align) -#else /* CONFIG_CFI_CLANG */ +#else /* CONFIG_CFI */ #define SYM_TYPED_START(name, linkage, align...) \ SYM_START(name, linkage, align) -#endif /* CONFIG_CFI_CLANG */ +#endif /* CONFIG_CFI */ #ifndef SYM_TYPED_FUNC_START #define SYM_TYPED_FUNC_START(name) \ diff --git a/tools/perf/util/include/linux/linkage.h b/tools/perf/util/include/linux/linkage.h index 89979ca23c3f..34e2fdfe7300 100644 --- a/tools/perf/util/include/linux/linkage.h +++ b/tools/perf/util/include/linux/linkage.h @@ -120,7 +120,7 @@ #endif // In the kernel sources (include/linux/cfi_types.h), this has a different -// definition when CONFIG_CFI_CLANG is used, for tools/ just use the !clang +// definition when CONFIG_CFI is used, for tools/ just use the !cfi // definition: #ifndef SYM_TYPED_START #define SYM_TYPED_START(name, linkage, align...) \ -- cgit v1.2.3 From 340974c4f709ce8142a672ab11f1889dff94d6dc Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 18 Aug 2025 09:39:00 +0530 Subject: mailbox: Add common header for RPMI messages sent via mailbox The RPMI-based mailbox controller drivers and mailbox clients need to share defines related to RPMI messages over the mailbox interface, so add a common header for this purpose. Acked-by: Jassi Brar Co-developed-by: Rahul Pathak Signed-off-by: Rahul Pathak Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20250818040920.272664-5-apatel@ventanamicro.com Signed-off-by: Paul Walmsley --- include/linux/mailbox/riscv-rpmi-message.h | 214 +++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 include/linux/mailbox/riscv-rpmi-message.h (limited to 'include/linux') diff --git a/include/linux/mailbox/riscv-rpmi-message.h b/include/linux/mailbox/riscv-rpmi-message.h new file mode 100644 index 000000000000..c3a98fc12c0a --- /dev/null +++ b/include/linux/mailbox/riscv-rpmi-message.h @@ -0,0 +1,214 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2025 Ventana Micro Systems Inc.
*/ + +#ifndef _LINUX_RISCV_RPMI_MESSAGE_H_ +#define _LINUX_RISCV_RPMI_MESSAGE_H_ + +#include +#include +#include +#include + +/* RPMI version encode/decode macros */ +#define RPMI_VER_MAJOR(__ver) upper_16_bits(__ver) +#define RPMI_VER_MINOR(__ver) lower_16_bits(__ver) +#define RPMI_MKVER(__maj, __min) (((u32)(__maj) << 16) | (u16)(__min)) + +/* RPMI message header */ +struct rpmi_message_header { + __le16 servicegroup_id; + u8 service_id; + u8 flags; + __le16 datalen; + __le16 token; +}; + +/* RPMI message */ +struct rpmi_message { + struct rpmi_message_header header; + u8 data[]; +}; + +/* RPMI notification event */ +struct rpmi_notification_event { + __le16 event_datalen; + u8 event_id; + u8 reserved; + u8 event_data[]; +}; + +/* RPMI error codes */ +enum rpmi_error_codes { + RPMI_SUCCESS = 0, + RPMI_ERR_FAILED = -1, + RPMI_ERR_NOTSUPP = -2, + RPMI_ERR_INVALID_PARAM = -3, + RPMI_ERR_DENIED = -4, + RPMI_ERR_INVALID_ADDR = -5, + RPMI_ERR_ALREADY = -6, + RPMI_ERR_EXTENSION = -7, + RPMI_ERR_HW_FAULT = -8, + RPMI_ERR_BUSY = -9, + RPMI_ERR_INVALID_STATE = -10, + RPMI_ERR_BAD_RANGE = -11, + RPMI_ERR_TIMEOUT = -12, + RPMI_ERR_IO = -13, + RPMI_ERR_NO_DATA = -14, + RPMI_ERR_RESERVED_START = -15, + RPMI_ERR_RESERVED_END = -127, + RPMI_ERR_VENDOR_START = -128, +}; + +static inline int rpmi_to_linux_error(int rpmi_error) +{ + switch (rpmi_error) { + case RPMI_SUCCESS: + return 0; + case RPMI_ERR_INVALID_PARAM: + case RPMI_ERR_BAD_RANGE: + case RPMI_ERR_INVALID_STATE: + return -EINVAL; + case RPMI_ERR_DENIED: + return -EPERM; + case RPMI_ERR_INVALID_ADDR: + case RPMI_ERR_HW_FAULT: + return -EFAULT; + case RPMI_ERR_ALREADY: + return -EALREADY; + case RPMI_ERR_BUSY: + return -EBUSY; + case RPMI_ERR_TIMEOUT: + return -ETIMEDOUT; + case RPMI_ERR_IO: + return -ECOMM; + case RPMI_ERR_FAILED: + case RPMI_ERR_NOTSUPP: + case RPMI_ERR_NO_DATA: + case RPMI_ERR_EXTENSION: + default: + return -EOPNOTSUPP; + } +} + +/* RPMI Linux mailbox attribute IDs */ +enum rpmi_mbox_attribute_id { + RPMI_MBOX_ATTR_SPEC_VERSION, + RPMI_MBOX_ATTR_MAX_MSG_DATA_SIZE, + RPMI_MBOX_ATTR_SERVICEGROUP_ID, + RPMI_MBOX_ATTR_SERVICEGROUP_VERSION, + RPMI_MBOX_ATTR_IMPL_ID, + RPMI_MBOX_ATTR_IMPL_VERSION, + RPMI_MBOX_ATTR_MAX_ID +}; + +/* RPMI Linux mailbox message types */ +enum rpmi_mbox_message_type { + RPMI_MBOX_MSG_TYPE_GET_ATTRIBUTE, + RPMI_MBOX_MSG_TYPE_SET_ATTRIBUTE, + RPMI_MBOX_MSG_TYPE_SEND_WITH_RESPONSE, + RPMI_MBOX_MSG_TYPE_SEND_WITHOUT_RESPONSE, + RPMI_MBOX_MSG_TYPE_NOTIFICATION_EVENT, + RPMI_MBOX_MSG_MAX_TYPE +}; + +/* RPMI Linux mailbox message instance */ +struct rpmi_mbox_message { + enum rpmi_mbox_message_type type; + union { + struct { + enum rpmi_mbox_attribute_id id; + u32 value; + } attr; + + struct { + u32 service_id; + void *request; + unsigned long request_len; + void *response; + unsigned long max_response_len; + unsigned long out_response_len; + } data; + + struct { + u16 event_datalen; + u8 event_id; + u8 *event_data; + } notif; + }; + int error; +}; + +/* RPMI Linux mailbox message helper routines */ +static inline void rpmi_mbox_init_get_attribute(struct rpmi_mbox_message *msg, + enum rpmi_mbox_attribute_id id) +{ + msg->type = RPMI_MBOX_MSG_TYPE_GET_ATTRIBUTE; + msg->attr.id = id; + msg->attr.value = 0; + msg->error = 0; +} + +static inline void rpmi_mbox_init_set_attribute(struct rpmi_mbox_message *msg, + enum rpmi_mbox_attribute_id id, + u32 value) +{ + msg->type = RPMI_MBOX_MSG_TYPE_SET_ATTRIBUTE; + msg->attr.id = id; + msg->attr.value = value; + msg->error = 0; +} + +static inline void 
rpmi_mbox_init_send_with_response(struct rpmi_mbox_message *msg, + u32 service_id, + void *request, + unsigned long request_len, + void *response, + unsigned long max_response_len) +{ + msg->type = RPMI_MBOX_MSG_TYPE_SEND_WITH_RESPONSE; + msg->data.service_id = service_id; + msg->data.request = request; + msg->data.request_len = request_len; + msg->data.response = response; + msg->data.max_response_len = max_response_len; + msg->data.out_response_len = 0; + msg->error = 0; +} + +static inline void rpmi_mbox_init_send_without_response(struct rpmi_mbox_message *msg, + u32 service_id, + void *request, + unsigned long request_len) +{ + msg->type = RPMI_MBOX_MSG_TYPE_SEND_WITHOUT_RESPONSE; + msg->data.service_id = service_id; + msg->data.request = request; + msg->data.request_len = request_len; + msg->data.response = NULL; + msg->data.max_response_len = 0; + msg->data.out_response_len = 0; + msg->error = 0; +} + +static inline void *rpmi_mbox_get_msg_response(struct rpmi_mbox_message *msg) +{ + return msg ? msg->data.response : NULL; +} + +static inline int rpmi_mbox_send_message(struct mbox_chan *chan, + struct rpmi_mbox_message *msg) +{ + int ret; + + /* Send message for the underlying mailbox channel */ + ret = mbox_send_message(chan, msg); + if (ret < 0) + return ret; + + /* Explicitly signal txdone for mailbox channel */ + ret = msg->error; + mbox_client_txdone(chan, ret); + return ret; +} + +#endif /* _LINUX_RISCV_RPMI_MESSAGE_H_ */ -- cgit v1.2.3 From f32a26fab3672e60f622bd7461bf978fc72f29ec Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Wed, 24 Sep 2025 16:24:41 -0700 Subject: hfs/hfsplus: rework debug output subsystem Currently, HFS/HFS+ has a very obsolete and inconvenient debug output subsystem. Also, the code is duplicated in the HFS and HFS+ drivers. This patch introduces linux/hfs_common.h for gathering common declarations, inline functions, and common short methods. Currently, this file contains only the hfs_dbg() function, which employs pr_debug() to print debug-level messages conditionally. Now it is possible to enable the debug output by means of: echo 'file extent.c +p' > /proc/dynamic_debug/control echo 'func hfsplus_evict_inode +p' > /proc/dynamic_debug/control The debug output looks like this: hfs: pid 5831:fs/hfs/catalog.c:228 hfs_cat_delete(): delete_cat: 00,48 hfs: pid 5831:fs/hfs/extent.c:484 hfs_file_truncate(): truncate: 48, 409600 -> 0 hfs: pid 5831:fs/hfs/extent.c:212 hfs_dump_extent(): hfs: pid 5831:fs/hfs/extent.c:214 hfs_dump_extent(): 78:4 hfs: pid 5831:fs/hfs/extent.c:214 hfs_dump_extent(): 0:0 hfs: pid 5831:fs/hfs/extent.c:214 hfs_dump_extent(): 0:0 v4: Debug messages have been reworked and information about the new HFS/HFS+ shared declarations file has been added to the MAINTAINERS file. v5: Yangtao Li suggested cleaning up the debug output and fixing several typos.
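For reference, the whole debug subsystem now reduces to a single pr_debug()-based wrapper, reproduced here (lightly reformatted) from the include/linux/hfs_common.h hunk below, so every call site becomes a plain printf-style call. The last line is an example call site taken from the fs/hfs/inode.c hunk in this patch:

#define hfs_dbg(fmt, ...) \
	pr_debug("pid %d:%s:%d %s(): " fmt, \
		 current->pid, __FILE__, __LINE__, __func__, ##__VA_ARGS__)

	hfs_dbg("ino %lu\n", inode->i_ino);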
Signed-off-by: Viacheslav Dubeyko cc: John Paul Adrian Glaubitz cc: Yangtao Li cc: linux-fsdevel@vger.kernel.org cc: Johannes Thumshirn Signed-off-by: Viacheslav Dubeyko --- MAINTAINERS | 2 ++ fs/hfs/bfind.c | 4 ++-- fs/hfs/bitmap.c | 4 ++-- fs/hfs/bnode.c | 28 ++++++++++++++-------------- fs/hfs/brec.c | 8 ++++---- fs/hfs/btree.c | 2 +- fs/hfs/catalog.c | 14 +++++++------- fs/hfs/extent.c | 19 ++++++++++--------- fs/hfs/hfs_fs.h | 33 +-------------------------------- fs/hfs/inode.c | 4 ++-- fs/hfsplus/attributes.c | 8 ++++---- fs/hfsplus/bfind.c | 4 ++-- fs/hfsplus/bitmap.c | 10 +++++----- fs/hfsplus/bnode.c | 28 ++++++++++++++-------------- fs/hfsplus/brec.c | 10 +++++----- fs/hfsplus/btree.c | 4 ++-- fs/hfsplus/catalog.c | 6 +++--- fs/hfsplus/extents.c | 27 +++++++++++++-------------- fs/hfsplus/hfsplus_fs.h | 35 +---------------------------------- fs/hfsplus/super.c | 16 ++++++++++++---- fs/hfsplus/xattr.c | 4 ++-- include/linux/hfs_common.h | 20 ++++++++++++++++++++ 22 files changed, 128 insertions(+), 162 deletions(-) create mode 100644 include/linux/hfs_common.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index f5e1540cb702..83579b15ba43 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10794,6 +10794,7 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/vdubeyko/hfs.git F: Documentation/filesystems/hfs.rst F: fs/hfs/ +F: include/linux/hfs_common.h HFSPLUS FILESYSTEM M: Viacheslav Dubeyko @@ -10804,6 +10805,7 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/vdubeyko/hfs.git F: Documentation/filesystems/hfsplus.rst F: fs/hfsplus/ +F: include/linux/hfs_common.h HGA FRAMEBUFFER DRIVER M: Ferenc Bakonyi diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c index e46f650b5e9c..c2f840c49e60 100644 --- a/fs/hfs/bfind.c +++ b/fs/hfs/bfind.c @@ -26,7 +26,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; - hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", tree->cnid, __builtin_return_address(0)); switch (tree->cnid) { case HFS_CAT_CNID: @@ -48,7 +48,7 @@ void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); - hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c index 28307bc9ec1e..5e84833a4743 100644 --- a/fs/hfs/bitmap.c +++ b/fs/hfs/bitmap.c @@ -158,7 +158,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits) } } - hfs_dbg(BITMAP, "alloc_bits: %u,%u\n", pos, *num_bits); + hfs_dbg("pos %u, num_bits %u\n", pos, *num_bits); HFS_SB(sb)->free_ablocks -= *num_bits; hfs_bitmap_dirty(sb); out: @@ -200,7 +200,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count) if (!count) return 0; - hfs_dbg(BITMAP, "clear_bits: %u,%u\n", start, count); + hfs_dbg("start %u, count %u\n", start, count); /* are all of the bits in range? 
*/ if ((start + count) > HFS_SB(sb)->fs_ablocks) return -2; diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index e8cd1a31f247..fcfffe75d84e 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -200,7 +200,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, { struct page *src_page, *dst_page; - hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -221,7 +221,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) struct page *page; void *ptr; - hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -243,16 +243,16 @@ void hfs_bnode_dump(struct hfs_bnode *node) __be32 cnid; int i, off, key_off; - hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); + hfs_dbg("node %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); - hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", + hfs_dbg("next %d, prev %d, type %d, height %d, num_recs %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); - hfs_dbg_cont(BNODE_MOD, " %d", key_off); + hfs_dbg(" key_off %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; @@ -260,18 +260,18 @@ void hfs_bnode_dump(struct hfs_bnode *node) tmp = (hfs_bnode_read_u8(node, key_off) | 1) + 1; else tmp = node->tree->max_key_len + 1; - hfs_dbg_cont(BNODE_MOD, " (%d,%d", - tmp, hfs_bnode_read_u8(node, key_off)); + hfs_dbg(" (%d,%d", + tmp, hfs_bnode_read_u8(node, key_off)); hfs_bnode_read(node, &cnid, key_off + tmp, 4); - hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + hfs_dbg(", cnid %d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u8(node, key_off); - hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); + hfs_dbg(" (%d)", tmp); } } - hfs_dbg_cont(BNODE_MOD, "\n"); + hfs_dbg("\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) @@ -361,7 +361,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); - hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + hfs_dbg("cnid %d, node %d, refcnt 1\n", node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); @@ -401,7 +401,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; - hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) @@ -546,7 +546,7 @@ void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); - hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); } @@ -559,7 +559,7 @@ void hfs_bnode_put(struct hfs_bnode *node) struct hfs_btree *tree = node->tree; int i; - hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); diff --git a/fs/hfs/brec.c b/fs/hfs/brec.c index b01db1fae147..e49a141c87e5 100644 --- a/fs/hfs/brec.c +++ b/fs/hfs/brec.c @@ -94,7 +94,7 @@ again: end_rec_off = tree->node_size - 
(node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; - hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + hfs_dbg("rec %d, size %d, end_off %d, end_rec_off %d\n", rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) @@ -192,7 +192,7 @@ again: mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); - hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", + hfs_dbg("rec %d, len %d\n", fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); @@ -261,7 +261,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); - hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", + hfs_dbg("this %d, new %d, next %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; @@ -397,7 +397,7 @@ again: newkeylen = (hfs_bnode_read_u8(node, 14) | 1) + 1; else fd->keylength = newkeylen = tree->max_key_len + 1; - hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", + hfs_dbg("rec %d, keylength %d, newkeylen %d\n", rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index e86e1e235658..22e62fe7448b 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -364,7 +364,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u32 nidx; u8 *data, byte, m; - hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); + hfs_dbg("node %u\n", node->this); tree = node->tree; nidx = node->this; node = hfs_bnode_find(tree, 0); diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index b6b18ee68135..caebabb6642f 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -87,7 +87,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i int entry_size; int err; - hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", + hfs_dbg("name %s, cnid %u, i_nlink %d\n", str->name, cnid, inode->i_nlink); if (dir->i_size >= HFS_MAX_VALENCE) return -ENOSPC; @@ -238,7 +238,7 @@ int hfs_correct_next_unused_CNID(struct super_block *sb, u32 cnid) s64 leaf_tail; s64 node_id; - hfs_dbg(CAT_MOD, "correct next unused CNID: cnid %u, next_id %lld\n", + hfs_dbg("cnid %u, next_id %lld\n", cnid, atomic64_read(&HFS_SB(sb)->next_id)); if ((cnid + 1) < atomic64_read(&HFS_SB(sb)->next_id)) { @@ -274,7 +274,7 @@ int hfs_correct_next_unused_CNID(struct super_block *sb, u32 cnid) return -ENOENT; } - hfs_dbg(CAT_MOD, "node_id %lld, leaf_tail %lld, leaf_head %lld\n", + hfs_dbg("node %lld, leaf_tail %lld, leaf_head %lld\n", node_id, leaf_tail, leaf_head); hfs_bnode_dump(node); @@ -309,13 +309,13 @@ int hfs_correct_next_unused_CNID(struct super_block *sb, u32 cnid) if (rec.type == HFS_CDR_DIR) { found_cnid = be32_to_cpu(rec.dir.DirID); - hfs_dbg(CAT_MOD, "found_cnid %u\n", found_cnid); + hfs_dbg("found_cnid %u\n", found_cnid); hfs_set_next_unused_CNID(sb, cnid, found_cnid); hfs_bnode_put(node); return 0; } else if (rec.type == HFS_CDR_FIL) { found_cnid = be32_to_cpu(rec.file.FlNum); - hfs_dbg(CAT_MOD, "found_cnid %u\n", found_cnid); + hfs_dbg("found_cnid %u\n", found_cnid); hfs_set_next_unused_CNID(sb, cnid, found_cnid); hfs_bnode_put(node); return 0; @@ -343,7 +343,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str) struct hfs_readdir_data *rd; int res, type; - hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); + hfs_dbg("name %s, cnid %u\n", str ? 
str->name : NULL, cnid); sb = dir->i_sb; res = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); if (res) @@ -417,7 +417,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, int entry_size, type; int err; - hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + hfs_dbg("cnid %u - (ino %lu, name %s) - (ino %lu, name %s)\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); sb = src_dir->i_sb; diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index 580c62981dbd..a097908b269d 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -209,12 +209,12 @@ static void hfs_dump_extent(struct hfs_extent *extent) { int i; - hfs_dbg(EXTENT, " "); + hfs_dbg("extent: "); for (i = 0; i < 3; i++) - hfs_dbg_cont(EXTENT, " %u:%u", - be16_to_cpu(extent[i].block), - be16_to_cpu(extent[i].count)); - hfs_dbg_cont(EXTENT, "\n"); + hfs_dbg(" block %u, count %u", + be16_to_cpu(extent[i].block), + be16_to_cpu(extent[i].count)); + hfs_dbg("\n"); } static int hfs_add_extent(struct hfs_extent *extent, u16 offset, @@ -411,10 +411,11 @@ int hfs_extend_file(struct inode *inode) goto out; } - hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) { if (!HFS_I(inode)->first_blocks) { - hfs_dbg(EXTENT, "first extents\n"); + hfs_dbg("first_extent: start %u, len %u\n", + start, len); /* no extents yet */ HFS_I(inode)->first_extents[0].block = cpu_to_be16(start); HFS_I(inode)->first_extents[0].count = cpu_to_be16(len); @@ -456,7 +457,7 @@ out: return res; insert_extent: - hfs_dbg(EXTENT, "insert new extent\n"); + hfs_dbg("insert new extent\n"); res = hfs_ext_write_extent(inode); if (res) goto out; @@ -481,7 +482,7 @@ void hfs_file_truncate(struct inode *inode) u32 size; int res; - hfs_dbg(INODE, "truncate: %lu, %Lu -> %Lu\n", + hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n", inode->i_ino, (long long)HFS_I(inode)->phys_size, inode->i_size); if (inode->i_size > HFS_I(inode)->phys_size) { diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index c732a066de78..fff149af89da 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -9,12 +9,6 @@ #ifndef _LINUX_HFS_FS_H #define _LINUX_HFS_FS_H -#ifdef pr_fmt -#undef pr_fmt -#endif - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include #include #include @@ -24,35 +18,10 @@ #include #include +#include #include "hfs.h" -#define DBG_BNODE_REFS 0x00000001 -#define DBG_BNODE_MOD 0x00000002 -#define DBG_CAT_MOD 0x00000004 -#define DBG_INODE 0x00000008 -#define DBG_SUPER 0x00000010 -#define DBG_EXTENT 0x00000020 -#define DBG_BITMAP 0x00000040 - -//#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD|DBG_CAT_MOD|DBG_BITMAP) -//#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) -//#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) -#define DBG_MASK (0) - -#define hfs_dbg(flg, fmt, ...) \ -do { \ - if (DBG_##flg & DBG_MASK) \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ -} while (0) - -#define hfs_dbg_cont(flg, fmt, ...) 
\ -do { \ - if (DBG_##flg & DBG_MASK) \ - pr_cont(fmt, ##__VA_ARGS__); \ -} while (0) - - /* * struct hfs_inode_info * diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 490b6ad5d981..9cd449913dc8 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -249,7 +249,7 @@ void hfs_delete_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; - hfs_dbg(INODE, "delete_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); if (S_ISDIR(inode->i_mode)) { BUG_ON(atomic64_read(&HFS_SB(sb)->folder_count) > U32_MAX); atomic64_dec(&HFS_SB(sb)->folder_count); @@ -436,7 +436,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) hfs_cat_rec rec; int res; - hfs_dbg(INODE, "hfs_write_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); res = hfs_ext_write_extent(inode); if (res) return res; diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index eeebe80c6be4..ba26980cc503 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -139,7 +139,7 @@ int hfsplus_find_attr(struct super_block *sb, u32 cnid, { int err = 0; - hfs_dbg(ATTR_MOD, "find_attr: %s,%d\n", name ? name : NULL, cnid); + hfs_dbg("name %s, cnid %d\n", name ? name : NULL, cnid); if (!HFSPLUS_SB(sb)->attr_tree) { pr_err("attributes file doesn't exist\n"); @@ -201,7 +201,7 @@ int hfsplus_create_attr(struct inode *inode, int entry_size; int err; - hfs_dbg(ATTR_MOD, "create_attr: %s,%ld\n", + hfs_dbg("name %s, ino %ld\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { @@ -310,7 +310,7 @@ int hfsplus_delete_attr(struct inode *inode, const char *name) struct super_block *sb = inode->i_sb; struct hfs_find_data fd; - hfs_dbg(ATTR_MOD, "delete_attr: %s,%ld\n", + hfs_dbg("name %s, ino %ld\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { @@ -356,7 +356,7 @@ int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid) int err = 0; struct hfs_find_data fd; - hfs_dbg(ATTR_MOD, "delete_all_attrs: %d\n", cnid); + hfs_dbg("cnid %d\n", cnid); if (!HFSPLUS_SB(dir->i_sb)->attr_tree) { pr_err("attributes file doesn't exist\n"); diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index 26ebac4c6042..afc9c89e8c6a 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; - hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", tree->cnid, __builtin_return_address(0)); mutex_lock_nested(&tree->tree_lock, hfsplus_btree_lock_class(tree)); @@ -34,7 +34,7 @@ void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); - hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", + hfs_dbg("cnid %d, caller %ps\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index bd8dcea85588..1b3af8c87cad 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -31,7 +31,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, if (!len) return size; - hfs_dbg(BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); + hfs_dbg("size %u, offset %u, len %u\n", size, offset, len); mutex_lock(&sbi->alloc_mutex); mapping = sbi->alloc_file->i_mapping; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); @@ -90,14 +90,14 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, else end = pptr + ((size + 31) & (PAGE_CACHE_BITS - 
1)) / 32; } - hfs_dbg(BITMAP, "bitmap full\n"); + hfs_dbg("bitmap full\n"); start = size; goto out; found: start = offset + (curr - pptr) * 32 + i; if (start >= size) { - hfs_dbg(BITMAP, "bitmap full\n"); + hfs_dbg("bitmap full\n"); goto out; } /* do any partial u32 at the start */ @@ -155,7 +155,7 @@ done: *max = offset + (curr - pptr) * 32 + i - start; sbi->free_blocks -= *max; hfsplus_mark_mdb_dirty(sb); - hfs_dbg(BITMAP, "-> %u,%u\n", start, *max); + hfs_dbg("start %u, max %u\n", start, *max); out: mutex_unlock(&sbi->alloc_mutex); return start; @@ -174,7 +174,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) if (!count) return 0; - hfs_dbg(BITMAP, "block_free: %u,%u\n", offset, count); + hfs_dbg("offset %u, count %u\n", offset, count); /* are all of the bits in range? */ if ((offset + count) > sbi->total_blocks) return -ENOENT; diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 407d5152eb41..63e652ad1e0d 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -173,7 +173,7 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct page **src_page, **dst_page; int l; - hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -231,7 +231,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len) void *src_ptr, *dst_ptr; int l; - hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len); + hfs_dbg("dst %u, src %u, len %u\n", dst, src, len); if (!len) return; @@ -351,16 +351,16 @@ void hfs_bnode_dump(struct hfs_bnode *node) __be32 cnid; int i, off, key_off; - hfs_dbg(BNODE_MOD, "bnode: %d\n", node->this); + hfs_dbg("node %d\n", node->this); hfs_bnode_read(node, &desc, 0, sizeof(desc)); - hfs_dbg(BNODE_MOD, "%d, %d, %d, %d, %d\n", + hfs_dbg("next %d, prev %d, type %d, height %d, num_recs %d\n", be32_to_cpu(desc.next), be32_to_cpu(desc.prev), desc.type, desc.height, be16_to_cpu(desc.num_recs)); off = node->tree->node_size - 2; for (i = be16_to_cpu(desc.num_recs); i >= 0; off -= 2, i--) { key_off = hfs_bnode_read_u16(node, off); - hfs_dbg(BNODE_MOD, " %d", key_off); + hfs_dbg(" key_off %d", key_off); if (i && node->type == HFS_NODE_INDEX) { int tmp; @@ -369,17 +369,17 @@ void hfs_bnode_dump(struct hfs_bnode *node) tmp = hfs_bnode_read_u16(node, key_off) + 2; else tmp = node->tree->max_key_len + 2; - hfs_dbg_cont(BNODE_MOD, " (%d", tmp); + hfs_dbg(" (%d", tmp); hfs_bnode_read(node, &cnid, key_off + tmp, 4); - hfs_dbg_cont(BNODE_MOD, ",%d)", be32_to_cpu(cnid)); + hfs_dbg(", cnid %d)", be32_to_cpu(cnid)); } else if (i && node->type == HFS_NODE_LEAF) { int tmp; tmp = hfs_bnode_read_u16(node, key_off); - hfs_dbg_cont(BNODE_MOD, " (%d)", tmp); + hfs_dbg(" (%d)", tmp); } } - hfs_dbg_cont(BNODE_MOD, "\n"); + hfs_dbg("\n"); } void hfs_bnode_unlink(struct hfs_bnode *node) @@ -415,7 +415,7 @@ void hfs_bnode_unlink(struct hfs_bnode *node) /* move down? 
*/ if (!node->prev && !node->next) - hfs_dbg(BNODE_MOD, "hfs_btree_del_level\n"); + hfs_dbg("btree delete level\n"); if (!node->parent) { tree->root = 0; tree->depth = 0; @@ -470,7 +470,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) node->this = cnid; set_bit(HFS_BNODE_NEW, &node->flags); atomic_set(&node->refcnt, 1); - hfs_dbg(BNODE_REFS, "new_node(%d:%d): 1\n", + hfs_dbg("cnid %d, node %d, refcnt 1\n", node->tree->cnid, node->this); init_waitqueue_head(&node->lock_wq); spin_lock(&tree->hash_lock); @@ -510,7 +510,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) { struct hfs_bnode **p; - hfs_dbg(BNODE_REFS, "remove_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) @@ -656,7 +656,7 @@ void hfs_bnode_get(struct hfs_bnode *node) { if (node) { atomic_inc(&node->refcnt); - hfs_dbg(BNODE_REFS, "get_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); } @@ -669,7 +669,7 @@ void hfs_bnode_put(struct hfs_bnode *node) struct hfs_btree *tree = node->tree; int i; - hfs_dbg(BNODE_REFS, "put_node(%d:%d): %d\n", + hfs_dbg("cnid %d, node %d, refcnt %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); BUG_ON(!atomic_read(&node->refcnt)); diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 1918544a7871..b4645102feec 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -92,7 +92,7 @@ again: end_rec_off = tree->node_size - (node->num_recs + 1) * 2; end_off = hfs_bnode_read_u16(node, end_rec_off); end_rec_off -= 2; - hfs_dbg(BNODE_MOD, "insert_rec: %d, %d, %d, %d\n", + hfs_dbg("rec %d, size %d, end_off %d, end_rec_off %d\n", rec, size, end_off, end_rec_off); if (size > end_rec_off - end_off) { if (new_node) @@ -193,7 +193,7 @@ again: mark_inode_dirty(tree->inode); } hfs_bnode_dump(node); - hfs_dbg(BNODE_MOD, "remove_rec: %d, %d\n", + hfs_dbg("rec %d, len %d\n", fd->record, fd->keylength + fd->entrylength); if (!--node->num_recs) { hfs_bnode_unlink(node); @@ -246,7 +246,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) if (IS_ERR(new_node)) return new_node; hfs_bnode_get(node); - hfs_dbg(BNODE_MOD, "split_nodes: %d - %d - %d\n", + hfs_dbg("this %d - new %d - next %d\n", node->this, new_node->this, node->next); new_node->next = node->next; new_node->prev = node->this; @@ -383,7 +383,7 @@ again: newkeylen = hfs_bnode_read_u16(node, 14) + 2; else fd->keylength = newkeylen = tree->max_key_len + 2; - hfs_dbg(BNODE_MOD, "update_rec: %d, %d, %d\n", + hfs_dbg("rec %d, keylength %d, newkeylen %d\n", rec, fd->keylength, newkeylen); rec_off = tree->node_size - (rec + 2) * 2; @@ -395,7 +395,7 @@ again: end_off = hfs_bnode_read_u16(parent, end_rec_off); if (end_rec_off - end_off < diff) { - hfs_dbg(BNODE_MOD, "splitting index node\n"); + hfs_dbg("splitting index node\n"); fd->bnode = parent; new_node = hfs_bnode_split(fd); if (IS_ERR(new_node)) diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index fe6a54c4083c..7cc5aea14572 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -434,7 +434,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) kunmap_local(data); nidx = node->next; if (!nidx) { - hfs_dbg(BNODE_MOD, "create new bmap node\n"); + hfs_dbg("create new bmap node\n"); next_node = hfs_bmap_new_bmap(node, idx); } else next_node = hfs_bnode_find(tree, nidx); @@ -460,7 +460,7 @@ 
void hfs_bmap_free(struct hfs_bnode *node) u32 nidx; u8 *data, byte, m; - hfs_dbg(BNODE_MOD, "btree_free_node: %u\n", node->this); + hfs_dbg("node %u\n", node->this); BUG_ON(!node->this); tree = node->tree; nidx = node->this; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 1995bafee839..02c1eee4a4b8 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -259,7 +259,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, int entry_size; int err; - hfs_dbg(CAT_MOD, "create_cat: %s,%u(%d)\n", + hfs_dbg("name %s, cnid %u, i_nlink %d\n", str->name, cnid, inode->i_nlink); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) @@ -336,7 +336,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str) int err, off; u16 type; - hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); + hfs_dbg("name %s, cnid %u\n", str ? str->name : NULL, cnid); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) return err; @@ -441,7 +441,7 @@ int hfsplus_rename_cat(u32 cnid, int entry_size, type; int err; - hfs_dbg(CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", + hfs_dbg("cnid %u - ino %lu, name %s - ino %lu, name %s\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index b1699b3c246a..8e886514d27f 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -275,7 +275,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, mutex_unlock(&hip->extents_lock); done: - hfs_dbg(EXTENT, "get_block(%lu): %llu - %u\n", + hfs_dbg("ino %lu, iblock %llu - dblock %u\n", inode->i_ino, (long long)iblock, dblock); mask = (1 << sbi->fs_shift) - 1; @@ -298,12 +298,12 @@ static void hfsplus_dump_extent(struct hfsplus_extent *extent) { int i; - hfs_dbg(EXTENT, " "); + hfs_dbg("extent "); for (i = 0; i < 8; i++) - hfs_dbg_cont(EXTENT, " %u:%u", - be32_to_cpu(extent[i].start_block), - be32_to_cpu(extent[i].block_count)); - hfs_dbg_cont(EXTENT, "\n"); + hfs_dbg(" start_block %u, block_count %u", + be32_to_cpu(extent[i].start_block), + be32_to_cpu(extent[i].block_count)); + hfs_dbg("\n"); } static int hfsplus_add_extent(struct hfsplus_extent *extent, u32 offset, @@ -359,8 +359,7 @@ found: if (count <= block_nr) { err = hfsplus_block_free(sb, start, count); if (err) { - pr_err("can't free extent\n"); - hfs_dbg(EXTENT, " start: %u count: %u\n", + pr_err("can't free extent: start %u, count %u\n", start, count); } extent->block_count = 0; @@ -370,8 +369,7 @@ found: count -= block_nr; err = hfsplus_block_free(sb, start + count, block_nr); if (err) { - pr_err("can't free extent\n"); - hfs_dbg(EXTENT, " start: %u count: %u\n", + pr_err("can't free extent: start %u, count %u\n", start, count); } extent->block_count = cpu_to_be32(count); @@ -478,11 +476,12 @@ int hfsplus_file_extend(struct inode *inode, bool zeroout) goto out; } - hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); + hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len); if (hip->alloc_blocks <= hip->first_blocks) { if (!hip->first_blocks) { - hfs_dbg(EXTENT, "first extents\n"); + hfs_dbg("first_extent: start %u, len %u\n", + start, len); /* no extents yet */ hip->first_extents[0].start_block = cpu_to_be32(start); hip->first_extents[0].block_count = cpu_to_be32(len); @@ -521,7 +520,7 @@ out: return res; insert_extent: - hfs_dbg(EXTENT, "insert new extent\n"); + hfs_dbg("insert new extent\n"); res = 
hfsplus_ext_write_extent_locked(inode); if (res) goto out; @@ -546,7 +545,7 @@ void hfsplus_file_truncate(struct inode *inode) u32 alloc_cnt, blk_cnt, start; int res; - hfs_dbg(INODE, "truncate: %lu, %llu -> %llu\n", + hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n", inode->i_ino, (long long)hip->phys_size, inode->i_size); if (inode->i_size > hip->phys_size) { diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 9dd18de0bc89..89e8b19c127b 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -11,47 +11,14 @@ #ifndef _LINUX_HFSPLUS_FS_H #define _LINUX_HFSPLUS_FS_H -#ifdef pr_fmt -#undef pr_fmt -#endif - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include #include #include #include #include +#include #include "hfsplus_raw.h" -#define DBG_BNODE_REFS 0x00000001 -#define DBG_BNODE_MOD 0x00000002 -#define DBG_CAT_MOD 0x00000004 -#define DBG_INODE 0x00000008 -#define DBG_SUPER 0x00000010 -#define DBG_EXTENT 0x00000020 -#define DBG_BITMAP 0x00000040 -#define DBG_ATTR_MOD 0x00000080 - -#if 0 -#define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) -#define DBG_MASK (DBG_BNODE_MOD|DBG_CAT_MOD|DBG_INODE) -#define DBG_MASK (DBG_CAT_MOD|DBG_BNODE_REFS|DBG_INODE|DBG_EXTENT) -#endif -#define DBG_MASK (0) - -#define hfs_dbg(flg, fmt, ...) \ -do { \ - if (DBG_##flg & DBG_MASK) \ - printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \ -} while (0) - -#define hfs_dbg_cont(flg, fmt, ...) \ -do { \ - if (DBG_##flg & DBG_MASK) \ - pr_cont(fmt, ##__VA_ARGS__); \ -} while (0) - /* Runtime config options */ #define HFSPLUS_DEF_CR_TYPE 0x3F3F3F3F /* '????' */ diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 77ec048021a0..16bc4abc67e0 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -163,7 +163,7 @@ static int hfsplus_write_inode(struct inode *inode, { int err; - hfs_dbg(INODE, "hfsplus_write_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); err = hfsplus_ext_write_extent(inode); if (err) @@ -178,7 +178,7 @@ static int hfsplus_write_inode(struct inode *inode, static void hfsplus_evict_inode(struct inode *inode) { - hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino); + hfs_dbg("ino %lu\n", inode->i_ino); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HFSPLUS_IS_RSRC(inode)) { @@ -197,7 +197,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) if (!wait) return 0; - hfs_dbg(SUPER, "hfsplus_sync_fs\n"); + hfs_dbg("starting...\n"); /* * Explicitly write out the special metadata inodes. 
@@ -228,6 +228,10 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) vhdr->folder_count = cpu_to_be32(sbi->folder_count); vhdr->file_count = cpu_to_be32(sbi->file_count); + hfs_dbg("free_blocks %u, next_cnid %u, folder_count %u, file_count %u\n", + sbi->free_blocks, sbi->next_cnid, + sbi->folder_count, sbi->file_count); + if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) { memcpy(sbi->s_backup_vhdr, sbi->s_vhdr, sizeof(*sbi->s_vhdr)); write_backup = 1; @@ -253,6 +257,8 @@ out: if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) blkdev_issue_flush(sb->s_bdev); + hfs_dbg("finished: err %d\n", error); + return error; } @@ -301,7 +307,7 @@ static void hfsplus_put_super(struct super_block *sb) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); - hfs_dbg(SUPER, "hfsplus_put_super\n"); + hfs_dbg("starting...\n"); cancel_delayed_work_sync(&sbi->sync_work); @@ -323,6 +329,8 @@ static void hfsplus_put_super(struct super_block *sb) kfree(sbi->s_vhdr_buf); kfree(sbi->s_backup_vhdr_buf); call_rcu(&sbi->rcu, delayed_free); + + hfs_dbg("finished\n"); } static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index c951fa9835aa..ece4d29c0ab9 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -64,7 +64,7 @@ static void hfsplus_init_header_node(struct inode *attr_file, u32 used_bmp_bytes; u64 tmp; - hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %u\n", + hfs_dbg("clump %u, node_size %u\n", clump_size, node_size); /* The end of the node contains list of record offsets */ @@ -132,7 +132,7 @@ static int hfsplus_create_attributes_file(struct super_block *sb) struct page *page; int old_state = HFSPLUS_EMPTY_ATTR_TREE; - hfs_dbg(ATTR_MOD, "create_attr_file: ino %d\n", HFSPLUS_ATTR_CNID); + hfs_dbg("ino %d\n", HFSPLUS_ATTR_CNID); check_attr_tree_state_again: switch (atomic_read(&sbi->attr_tree_state)) { diff --git a/include/linux/hfs_common.h b/include/linux/hfs_common.h new file mode 100644 index 000000000000..8838ca2f3d08 --- /dev/null +++ b/include/linux/hfs_common.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * HFS/HFS+ common definitions, inline functions, + * and shared functionality. + */ + +#ifndef _HFS_COMMON_H_ +#define _HFS_COMMON_H_ + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define hfs_dbg(fmt, ...) \ + pr_debug("pid %d:%s:%d %s(): " fmt, \ + current->pid, __FILE__, __LINE__, __func__, ##__VA_ARGS__) \ + +#endif /* _HFS_COMMON_H_ */ -- cgit v1.2.3 From ba879dfc0574878f3e08f217b2b4fdf845c426c0 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 18 Aug 2025 09:39:01 +0530 Subject: mailbox: Allow controller specific mapping using fwnode Introduce an optional fw_xlate() callback which allows a mailbox controller driver to provide controller-specific mapping using fwnode. The Linux OF framework already implements fwnode operations for the Linux DD framework so the fw_xlate() callback works fine with device tree as well.
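As an illustration (not part of the patch), a controller driver could supply its own fw_xlate() instead of relying on the default index-based mapping. A minimal sketch, assuming a hypothetical driver with a two-cell specifier; the driver name and cell encoding are made up:

#include <linux/err.h>
#include <linux/mailbox_controller.h>
#include <linux/property.h>

/* hypothetical encoding: args[0] = channel pair, args[1] = sub-channel */
static struct mbox_chan *demo_mbox_fw_xlate(struct mbox_controller *mbox,
					    const struct fwnode_reference_args *sp)
{
	unsigned int ch;

	if (sp->nargs != 2)
		return ERR_PTR(-EINVAL);

	ch = sp->args[0] * 2 + sp->args[1];
	if (ch >= mbox->num_chans)
		return ERR_PTR(-EINVAL);

	return &mbox->chans[ch];
}

/* wired up in probe, before mbox_controller_register():
 *	mbox->fw_xlate = demo_mbox_fw_xlate;
 */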
Acked-by: Jassi Brar Reviewed-by: Andy Shevchenko Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20250818040920.272664-6-apatel@ventanamicro.com Signed-off-by: Paul Walmsley --- drivers/mailbox/mailbox.c | 65 +++++++++++++++++++++++--------------- include/linux/mailbox_controller.h | 3 ++ 2 files changed, 43 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c index 5cd8ae222073..2acc6ec229a4 100644 --- a/drivers/mailbox/mailbox.c +++ b/drivers/mailbox/mailbox.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "mailbox.h" @@ -383,34 +384,56 @@ EXPORT_SYMBOL_GPL(mbox_bind_client); */ struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index) { - struct device *dev = cl->dev; + struct fwnode_reference_args fwspec; + struct fwnode_handle *fwnode; struct mbox_controller *mbox; struct of_phandle_args spec; struct mbox_chan *chan; + struct device *dev; + unsigned int i; int ret; - if (!dev || !dev->of_node) { - pr_debug("%s: No owner device node\n", __func__); + dev = cl->dev; + if (!dev) { + pr_debug("No owner device\n"); return ERR_PTR(-ENODEV); } - ret = of_parse_phandle_with_args(dev->of_node, "mboxes", "#mbox-cells", - index, &spec); + fwnode = dev_fwnode(dev); + if (!fwnode) { + dev_dbg(dev, "No owner fwnode\n"); + return ERR_PTR(-ENODEV); + } + + ret = fwnode_property_get_reference_args(fwnode, "mboxes", "#mbox-cells", + 0, index, &fwspec); if (ret) { - dev_err(dev, "%s: can't parse \"mboxes\" property\n", __func__); + dev_err(dev, "%s: can't parse \"%s\" property\n", __func__, "mboxes"); return ERR_PTR(ret); } + spec.np = to_of_node(fwspec.fwnode); + spec.args_count = fwspec.nargs; + for (i = 0; i < spec.args_count; i++) + spec.args[i] = fwspec.args[i]; + scoped_guard(mutex, &con_mutex) { chan = ERR_PTR(-EPROBE_DEFER); - list_for_each_entry(mbox, &mbox_cons, node) - if (mbox->dev->of_node == spec.np) { - chan = mbox->of_xlate(mbox, &spec); - if (!IS_ERR(chan)) - break; + list_for_each_entry(mbox, &mbox_cons, node) { + if (device_match_fwnode(mbox->dev, fwspec.fwnode)) { + if (mbox->fw_xlate) { + chan = mbox->fw_xlate(mbox, &fwspec); + if (!IS_ERR(chan)) + break; + } else if (mbox->of_xlate) { + chan = mbox->of_xlate(mbox, &spec); + if (!IS_ERR(chan)) + break; + } } + } - of_node_put(spec.np); + fwnode_handle_put(fwspec.fwnode); if (IS_ERR(chan)) return chan; @@ -427,15 +450,8 @@ EXPORT_SYMBOL_GPL(mbox_request_channel); struct mbox_chan *mbox_request_channel_byname(struct mbox_client *cl, const char *name) { - struct device_node *np = cl->dev->of_node; - int index; - - if (!np) { - dev_err(cl->dev, "%s() currently only supports DT\n", __func__); - return ERR_PTR(-EINVAL); - } + int index = device_property_match_string(cl->dev, "mbox-names", name); - index = of_property_match_string(np, "mbox-names", name); if (index < 0) { dev_err(cl->dev, "%s() could not locate channel named \"%s\"\n", __func__, name); @@ -470,9 +486,8 @@ void mbox_free_channel(struct mbox_chan *chan) } EXPORT_SYMBOL_GPL(mbox_free_channel); -static struct mbox_chan * -of_mbox_index_xlate(struct mbox_controller *mbox, - const struct of_phandle_args *sp) +static struct mbox_chan *fw_mbox_index_xlate(struct mbox_controller *mbox, + const struct fwnode_reference_args *sp) { int ind = sp->args[0]; @@ -523,8 +538,8 @@ int mbox_controller_register(struct mbox_controller *mbox) spin_lock_init(&chan->lock); } - if (!mbox->of_xlate) - mbox->of_xlate = of_mbox_index_xlate; + if (!mbox->fw_xlate && 
!mbox->of_xlate) + mbox->fw_xlate = fw_mbox_index_xlate; scoped_guard(mutex, &con_mutex) list_add_tail(&mbox->node, &mbox_cons); diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index ad01c4082358..80a427c7ca29 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -66,6 +66,7 @@ struct mbox_chan_ops { * no interrupt rises. Ignored if 'txdone_irq' is set. * @txpoll_period: If 'txdone_poll' is in effect, the API polls for * last TX's status after these many millisecs + * @fw_xlate: Controller driver specific mapping of channel via fwnode * @of_xlate: Controller driver specific mapping of channel via DT * @poll_hrt: API private. hrtimer used to poll for TXDONE on all * channels. @@ -79,6 +80,8 @@ struct mbox_controller { bool txdone_irq; bool txdone_poll; unsigned txpoll_period; + struct mbox_chan *(*fw_xlate)(struct mbox_controller *mbox, + const struct fwnode_reference_args *sp); struct mbox_chan *(*of_xlate)(struct mbox_controller *mbox, const struct of_phandle_args *sp); /* Internal to API */ -- cgit v1.2.3 From 6f01c24f3a7525e953d514367a6d91b39c8f1f6a Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 18 Aug 2025 09:39:02 +0530 Subject: byteorder: Add memcpy_to_le32() and memcpy_from_le32() Add common memcpy APIs for copying u32 array to/from __le32 array. Suggested-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Signed-off-by: Anup Patel Reviewed-by: Linus Walleij Acked-by: Jassi Brar Link: https://lore.kernel.org/r/20250818040920.272664-7-apatel@ventanamicro.com Signed-off-by: Paul Walmsley --- include/linux/byteorder/generic.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h index c9a4c96c9943..b3705e8bbe2b 100644 --- a/include/linux/byteorder/generic.h +++ b/include/linux/byteorder/generic.h @@ -173,6 +173,22 @@ static inline void cpu_to_le32_array(u32 *buf, unsigned int words) } } +static inline void memcpy_from_le32(u32 *dst, const __le32 *src, size_t words) +{ + size_t i; + + for (i = 0; i < words; i++) + dst[i] = le32_to_cpu(src[i]); +} + +static inline void memcpy_to_le32(__le32 *dst, const u32 *src, size_t words) +{ + size_t i; + + for (i = 0; i < words; i++) + dst[i] = cpu_to_le32(src[i]); +} + static inline void be16_add_cpu(__be16 *var, u16 val) { *var = cpu_to_be16(be16_to_cpu(*var) + val); -- cgit v1.2.3 From edcc8a38b5ac1a3dbd05e113a38a25b937ebefe5 Mon Sep 17 00:00:00 2001 From: Qi Xi Date: Tue, 9 Sep 2025 19:29:10 +0800 Subject: once: fix race by moving DO_ONCE to separate section The commit c2c60ea37e5b ("once: use __section(".data.once")") moved DO_ONCE's ___done variable to .data.once section, which conflicts with DO_ONCE_LITE() that also uses the same section. This creates a race condition when clear_warn_once is used:

Thread 1 (DO_ONCE)               Thread 2 (DO_ONCE)
__do_once_start
  read ___done (false)
acquire once_lock
execute func
__do_once_done
  write ___done (true)           __do_once_start
release once_lock                // Thread 3 clear_warn_once
                                 //   reset ___done
                                 read ___done (false)
                                 acquire once_lock
                                 execute func
schedule once_work               __do_once_done
once_deferred: OK                  write ___done (true)
  static_branch_disable          release once_lock
                                 schedule once_work
                                 once_deferred: BUG_ON(!static_key_enabled)

DO_ONCE_LITE() in once_lite.h is used by WARN_ON_ONCE() and other warning macros. Keep its ___done flag in the .data..once section and allow resetting by clear_warn_once, as originally intended.
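For reference, a minimal sketch (not from this patch) of the DO_ONCE_LITE() pattern whose flag clear_warn_once is meant to re-arm; the caller and the threshold are hypothetical:

#include <linux/once_lite.h>
#include <linux/printk.h>

/* warns once per boot; writing to clear_warn_once re-arms the warning */
static void check_len(size_t len)
{
	DO_ONCE_LITE_IF(len > 4096,
			pr_warn, "oversized request: %zu bytes\n", len);
}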
In contrast, DO_ONCE() is used for functions like get_random_once() and relies on its ___done flag for internal synchronization. We should not reset DO_ONCE() by clear_warn_once. Fix it by isolating DO_ONCE's ___done into a separate .data..do_once section, shielding it from clear_warn_once. Fixes: c2c60ea37e5b ("once: use __section(".data.once")") Reported-by: Hulk Robot Signed-off-by: Qi Xi Signed-off-by: Arnd Bergmann --- include/asm-generic/vmlinux.lds.h | 1 + include/linux/once.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index ae2d2359b79e..8efbe8c4874e 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -361,6 +361,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) __start_once = .; \ *(.data..once) \ __end_once = .; \ + *(.data..do_once) \ STRUCT_ALIGN(); \ *(__tracepoints) \ /* implement dynamic printk debug */ \ diff --git a/include/linux/once.h b/include/linux/once.h index 30346fcdc799..449a0e34ad5a 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -46,7 +46,7 @@ void __do_once_sleepable_done(bool *done, struct static_key_true *once_key, #define DO_ONCE(func, ...) \ ({ \ bool ___ret = false; \ - static bool __section(".data..once") ___done = false; \ + static bool __section(".data..do_once") ___done = false; \ static DEFINE_STATIC_KEY_TRUE(___once_key); \ if (static_branch_unlikely(&___once_key)) { \ unsigned long ___flags; \ @@ -64,7 +64,7 @@ void __do_once_sleepable_done(bool *done, struct static_key_true *once_key, #define DO_ONCE_SLEEPABLE(func, ...) \ ({ \ bool ___ret = false; \ - static bool __section(".data..once") ___done = false; \ + static bool __section(".data..do_once") ___done = false; \ static DEFINE_STATIC_KEY_TRUE(___once_key); \ if (static_branch_unlikely(&___once_key)) { \ ___ret = __do_once_sleepable_start(&___done); \ -- cgit v1.2.3 From 5295be6c4ea4e1ffd38ab0ab131a65afc6b78e9f Mon Sep 17 00:00:00 2001 From: Vitaly Margolin Date: Sun, 23 Jun 2024 09:19:15 +0300 Subject: accel/habanalabs: add generic message type to get error counters Add a new CPUCP generic message type to retrieve HBM, SRAM and critical error counters from the device. Signed-off-by: Vitaly Margolin Reviewed-by: Koby Elbaz Signed-off-by: Koby Elbaz --- drivers/accel/habanalabs/common/habanalabs_ioctl.c | 3 +++ include/linux/habanalabs/cpucp_if.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c index dc80ca921d90..ed0dee5aa9be 100644 --- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c @@ -961,6 +961,9 @@ static int send_fw_generic_request(struct hl_device *hdev, struct hl_info_args * case HL_PASSTHROUGH_VERSIONS: need_input_buff = false; break; + case HL_GET_ERR_COUNTERS_CMD: + need_input_buff = true; + break; default: return -EINVAL; } diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h index 7ed3fdd55dda..29c50e7427d1 100644 --- a/include/linux/habanalabs/cpucp_if.h +++ b/include/linux/habanalabs/cpucp_if.h @@ -1425,9 +1425,11 @@ struct cpucp_monitor_dump { * from "pkt_subidx" field in struct cpucp_packet. * * HL_PASSTHROUGHT_VERSIONS - Fetch all firmware versions. 
+ * HL_GET_ERR_COUNTERS_CMD - Command to get error counters */ enum hl_passthrough_type { HL_PASSTHROUGH_VERSIONS, + HL_GET_ERR_COUNTERS_CMD, }; #endif /* CPUCP_IF_H */ -- cgit v1.2.3 From 65a3f5bc331ca7384900641b46cadff63805c10e Mon Sep 17 00:00:00 2001 From: Ariel Aviad Date: Tue, 24 Sep 2024 11:02:49 +0300 Subject: accel/habanalabs: add HL_GET_P_STATE passthrough type Add a new passthrough type HL_GET_P_STATE to the cpucp generic ioctl to allow userspace to read the device performance state via firmware. Signed-off-by: Ariel Aviad Reviewed-by: Koby Elbaz Signed-off-by: Koby Elbaz --- drivers/accel/habanalabs/common/habanalabs_ioctl.c | 3 +++ include/linux/habanalabs/cpucp_if.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c index ed0dee5aa9be..fdfdabc85e54 100644 --- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c @@ -964,6 +964,9 @@ static int send_fw_generic_request(struct hl_device *hdev, struct hl_info_args * case HL_GET_ERR_COUNTERS_CMD: need_input_buff = true; break; + case HL_GET_P_STATE: + need_input_buff = false; + break; default: return -EINVAL; } diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h index 29c50e7427d1..45f181bcf890 100644 --- a/include/linux/habanalabs/cpucp_if.h +++ b/include/linux/habanalabs/cpucp_if.h @@ -1426,10 +1426,12 @@ struct cpucp_monitor_dump { * * HL_PASSTHROUGHT_VERSIONS - Fetch all firmware versions. * HL_GET_ERR_COUNTERS_CMD - Command to get error counters + * HL_GET_P_STATE - get performance state */ enum hl_passthrough_type { HL_PASSTHROUGH_VERSIONS, HL_GET_ERR_COUNTERS_CMD, + HL_GET_P_STATE, }; #endif /* CPUCP_IF_H */ -- cgit v1.2.3 From 10cdfcd37ade7ce736bc4a1927680f390a6b1f7b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Sep 2025 13:33:58 +0200 Subject: nstree: make struct ns_tree private Don't expose it directly. There's no need to do that. 
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/nstree.h | 13 ------------- kernel/nstree.c | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 29ad6402260c..8b8636690473 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -9,19 +9,6 @@ #include #include -/** - * struct ns_tree - Namespace tree - * @ns_tree: Rbtree of namespaces of a particular type - * @ns_list: Sequentially walkable list of all namespaces of this type - * @ns_tree_lock: Seqlock to protect the tree and list - */ -struct ns_tree { - struct rb_root ns_tree; - struct list_head ns_list; - seqlock_t ns_tree_lock; - int type; -}; - extern struct ns_tree cgroup_ns_tree; extern struct ns_tree ipc_ns_tree; extern struct ns_tree mnt_ns_tree; diff --git a/kernel/nstree.c b/kernel/nstree.c index bbe8bedc924c..ecc88b013eff 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -4,6 +4,20 @@ #include #include +/** + * struct ns_tree - Namespace tree + * @ns_tree: Rbtree of namespaces of a particular type + * @ns_list: Sequentially walkable list of all namespaces of this type + * @ns_tree_lock: Seqlock to protect the tree and list + * @type: type of namespaces in this tree + */ +struct ns_tree { + struct rb_root ns_tree; + struct list_head ns_list; + seqlock_t ns_tree_lock; + int type; +}; + struct ns_tree mnt_ns_tree = { .ns_tree = RB_ROOT, .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), -- cgit v1.2.3 From 4055526d35746ce8b04bfa5e14e14f28bb163186 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 24 Sep 2025 13:33:59 +0200 Subject: ns: move ns type into struct ns_common It's misplaced in struct proc_ns_operations and ns->ops might be NULL if the namespace is compiled out but we still want to know the type of the namespace for the initial namespace struct. 
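To make the effect concrete, a hedged sketch (not from the patch) of a type check that keeps working even when a namespace type is compiled out and ns->ops is NULL; the helper name is hypothetical:

#include <linux/ns_common.h>
#include <linux/sched.h>	/* CLONE_NEWNS */

/* hypothetical helper: classify a namespace without dereferencing ns->ops */
static inline bool ns_is_mount_ns(const struct ns_common *ns)
{
	/* ns->ops->type would be unusable here if ns->ops were NULL */
	return ns->ns_type == CLONE_NEWNS;
}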
Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 6 +++--- fs/nsfs.c | 18 +++++++++--------- include/linux/ns_common.h | 30 +++++++++++++++++++++++++----- include/linux/proc_ns.h | 1 - init/version-timestamp.c | 1 + ipc/msgutil.c | 1 + ipc/namespace.c | 1 - kernel/cgroup/cgroup.c | 1 + kernel/cgroup/namespace.c | 1 - kernel/nscommon.c | 5 +++-- kernel/nsproxy.c | 4 ++-- kernel/nstree.c | 8 ++++---- kernel/pid.c | 1 + kernel/pid_namespace.c | 2 -- kernel/time/namespace.c | 3 +-- kernel/user.c | 1 + kernel/user_namespace.c | 1 - kernel/utsname.c | 1 - net/core/net_namespace.c | 1 - 19 files changed, 52 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index d65917ec5544..01334d5038a2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4927,7 +4927,7 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, return -EINVAL; ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWUSER) + if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; /* @@ -5830,7 +5830,7 @@ static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq return ERR_PTR(-EINVAL); ns = get_proc_ns(file_inode(fd_file(f))); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return ERR_PTR(-EINVAL); mnt_ns = to_mnt_ns(ns); @@ -6016,6 +6016,7 @@ struct mnt_namespace init_mnt_ns = { .ns.ops = &mntns_operations, .user_ns = &init_user_ns, .ns.__ns_ref = REFCOUNT_INIT(1), + .ns.ns_type = ns_common_type(&init_mnt_ns), .passive = REFCOUNT_INIT(1), .mounts = RB_ROOT, .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), @@ -6333,7 +6334,6 @@ static struct user_namespace *mntns_owner(struct ns_common *ns) const struct proc_ns_operations mntns_operations = { .name = "mnt", - .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, diff --git a/fs/nsfs.c b/fs/nsfs.c index dc0a4404b971..e7fd8a790aaa 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -219,9 +219,9 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, return -EINVAL; return open_related_ns(ns, ns->ops->get_parent); case NS_GET_NSTYPE: - return ns->ops->type; + return ns->ns_type; case NS_GET_OWNER_UID: - if (ns->ops->type != CLONE_NEWUSER) + if (ns->ns_type != CLONE_NEWUSER) return -EINVAL; user_ns = container_of(ns, struct user_namespace, ns); argp = (uid_t __user *) arg; @@ -234,7 +234,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, case NS_GET_PID_IN_PIDNS: fallthrough; case NS_GET_TGID_IN_PIDNS: { - if (ns->ops->type != CLONE_NEWPID) + if (ns->ns_type != CLONE_NEWPID) return -EINVAL; ret = -ESRCH; @@ -273,7 +273,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, return ret; } case NS_GET_MNTNS_ID: - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; fallthrough; case NS_GET_ID: { @@ -293,7 +293,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg; size_t usize = _IOC_SIZE(ioctl); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; if (!uinfo) @@ -314,7 +314,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, struct file *f __free(fput) = NULL; size_t usize = _IOC_SIZE(ioctl); - if (ns->ops->type != CLONE_NEWNS) + if (ns->ns_type != CLONE_NEWNS) return -EINVAL; if (usize < MNT_NS_INFO_SIZE_VER0) @@ -453,7 +453,7 @@ static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, } fid->ns_id = 
ns->ns_id; - fid->ns_type = ns->ops->type; + fid->ns_type = ns->ns_type; fid->ns_inum = inode->i_ino; return FILEID_NSFS; } @@ -489,14 +489,14 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, return NULL; VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); - VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type); + VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type); VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); if (!__ns_ref_get(ns)) return NULL; } - switch (ns->ops->type) { + switch (ns->ns_type) { #ifdef CONFIG_CGROUPS case CLONE_NEWCGROUP: if (!current_in_namespace(to_cg_ns(ns))) diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 56492cd9ff8d..f5b68b8abb54 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -4,6 +4,7 @@ #include #include +#include struct proc_ns_operations; @@ -37,6 +38,7 @@ extern const struct proc_ns_operations timens_operations; extern const struct proc_ns_operations timens_for_children_operations; struct ns_common { + u32 ns_type; struct dentry *stashed; const struct proc_ns_operations *ops; unsigned int inum; @@ -51,7 +53,7 @@ struct ns_common { }; }; -int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum); +int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); #define to_ns_common(__ns) \ @@ -106,10 +108,28 @@ void __ns_common_free(struct ns_common *ns); struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) -#define ns_common_init(__ns) \ - __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), (((__ns) == ns_init_ns(__ns)) ? ns_init_inum(__ns) : 0)) - -#define ns_common_init_inum(__ns, __inum) __ns_common_init(to_ns_common(__ns), to_ns_operations(__ns), __inum) +#define ns_common_type(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CLONE_NEWCGROUP, \ + struct ipc_namespace *: CLONE_NEWIPC, \ + struct mnt_namespace *: CLONE_NEWNS, \ + struct net *: CLONE_NEWNET, \ + struct pid_namespace *: CLONE_NEWPID, \ + struct time_namespace *: CLONE_NEWTIME, \ + struct user_namespace *: CLONE_NEWUSER, \ + struct uts_namespace *: CLONE_NEWUTS) + +#define ns_common_init(__ns) \ + __ns_common_init(to_ns_common(__ns), \ + ns_common_type(__ns), \ + to_ns_operations(__ns), \ + (((__ns) == ns_init_ns(__ns)) ? 
ns_init_inum(__ns) : 0)) + +#define ns_common_init_inum(__ns, __inum) \ + __ns_common_init(to_ns_common(__ns), \ + ns_common_type(__ns), \ + to_ns_operations(__ns), \ + __inum) #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 08016f6e0e6f..e81b8e596e4f 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -17,7 +17,6 @@ struct inode; struct proc_ns_operations { const char *name; const char *real_ns_name; - int type; struct ns_common *(*get)(struct task_struct *task); void (*put)(struct ns_common *ns); int (*install)(struct nsset *nsset, struct ns_common *ns); diff --git a/init/version-timestamp.c b/init/version-timestamp.c index 376b7c856d4d..d071835121c2 100644 --- a/init/version-timestamp.c +++ b/init/version-timestamp.c @@ -8,6 +8,7 @@ #include struct uts_namespace init_uts_ns = { + .ns.ns_type = ns_common_type(&init_uts_ns), .ns.__ns_ref = REFCOUNT_INIT(2), .name = { .sysname = UTS_SYSNAME, diff --git a/ipc/msgutil.c b/ipc/msgutil.c index dca6c8ec8f5f..7a03f6d03de3 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -33,6 +33,7 @@ struct ipc_namespace init_ipc_ns = { #ifdef CONFIG_IPC_NS .ns.ops = &ipcns_operations, #endif + .ns.ns_type = ns_common_type(&init_ipc_ns), }; struct msg_msgseg { diff --git a/ipc/namespace.c b/ipc/namespace.c index d89dfd718d2b..76abac74a5c3 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -248,7 +248,6 @@ static struct user_namespace *ipcns_owner(struct ns_common *ns) const struct proc_ns_operations ipcns_operations = { .name = "ipc", - .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 245b43ff2fa4..9b75102e81cb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -224,6 +224,7 @@ struct cgroup_namespace init_cgroup_ns = { .ns.ops = &cgroupns_operations, .ns.inum = ns_init_inum(&init_cgroup_ns), .root_cset = &init_css_set, + .ns.ns_type = ns_common_type(&init_cgroup_ns), }; static struct file_system_type cgroup2_fs_type; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 04c98338ac08..241ca05f07c8 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -137,7 +137,6 @@ static struct user_namespace *cgroupns_owner(struct ns_common *ns) const struct proc_ns_operations cgroupns_operations = { .name = "cgroup", - .type = CLONE_NEWCGROUP, .get = cgroupns_get, .put = cgroupns_put, .install = cgroupns_install, diff --git a/kernel/nscommon.c b/kernel/nscommon.c index 3cef89ddef41..92c9df1e8774 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -7,7 +7,7 @@ #ifdef CONFIG_DEBUG_VFS static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) { - switch (ns->ops->type) { + switch (ns->ns_type) { #ifdef CONFIG_CGROUPS case CLONE_NEWCGROUP: VFS_WARN_ON_ONCE(ops != &cgroupns_operations); @@ -52,12 +52,13 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) } #endif -int __ns_common_init(struct ns_common *ns, const struct proc_ns_operations *ops, int inum) +int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; + ns->ns_type = ns_type; RB_CLEAR_NODE(&ns->ns_tree_node); INIT_LIST_HEAD(&ns->ns_list_node); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 5f31fdff8a38..8d62449237b6 100644 --- a/kernel/nsproxy.c +++ 
b/kernel/nsproxy.c @@ -545,9 +545,9 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (proc_ns_file(fd_file(f))) { ns = get_proc_ns(file_inode(fd_file(f))); - if (flags && (ns->ops->type != flags)) + if (flags && (ns->ns_type != flags)) err = -EINVAL; - flags = ns->ops->type; + flags = ns->ns_type; } else if (!IS_ERR(pidfd_pid(fd_file(f)))) { err = check_setns_flags(flags); } else { diff --git a/kernel/nstree.c b/kernel/nstree.c index ecc88b013eff..b24a320a11a6 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -106,7 +106,7 @@ void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) write_seqlock(&ns_tree->ns_tree_lock); - VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); /* @@ -128,7 +128,7 @@ void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) { VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); - VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type); + VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); write_seqlock(&ns_tree->ns_tree_lock); rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); @@ -197,7 +197,7 @@ struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) if (!node) return NULL; - VFS_WARN_ON_ONCE(node_to_ns(node)->ops->type != ns_type); + VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type); return node_to_ns(node); } @@ -225,7 +225,7 @@ struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, if (list_is_head(list, &ns_tree->ns_list)) return ERR_PTR(-ENOENT); - VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ops->type != ns_tree->type); + VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type); return list_entry_rcu(list, struct ns_common, ns_list_node); } diff --git a/kernel/pid.c b/kernel/pid.c index 7e8c66e0bf67..0c2dcddb317a 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -85,6 +85,7 @@ struct pid_namespace init_pid_ns = { #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, #endif + .ns.ns_type = ns_common_type(&init_pid_ns), }; EXPORT_SYMBOL_GPL(init_pid_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a262a3f19443..f5b222c8ac39 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -443,7 +443,6 @@ static struct user_namespace *pidns_owner(struct ns_common *ns) const struct proc_ns_operations pidns_operations = { .name = "pid", - .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, @@ -454,7 +453,6 @@ const struct proc_ns_operations pidns_operations = { const struct proc_ns_operations pidns_for_children_operations = { .name = "pid_for_children", .real_ns_name = "pid", - .type = CLONE_NEWPID, .get = pidns_for_children_get, .put = pidns_put, .install = pidns_install, diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 9f26e61be044..530cf99c2212 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -462,7 +462,6 @@ out: const struct proc_ns_operations timens_operations = { .name = "time", - .type = CLONE_NEWTIME, .get = timens_get, .put = timens_put, .install = timens_install, @@ -472,7 +471,6 @@ const struct proc_ns_operations timens_operations = { const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", .real_ns_name = "time", - .type = CLONE_NEWTIME, .get = timens_for_children_get, .put = timens_put, .install = 
timens_install, @@ -480,6 +478,7 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { + .ns.ns_type = ns_common_type(&init_time_ns), .ns.__ns_ref = REFCOUNT_INIT(3), .user_ns = &init_user_ns, .ns.inum = ns_init_inum(&init_time_ns), diff --git a/kernel/user.c b/kernel/user.c index b2a53674d506..0163665914c9 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -65,6 +65,7 @@ struct user_namespace init_user_ns = { .nr_extents = 1, }, }, + .ns.ns_type = ns_common_type(&init_user_ns), .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e1559e8a8a02..03cb63883d04 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1400,7 +1400,6 @@ static struct user_namespace *userns_owner(struct ns_common *ns) const struct proc_ns_operations userns_operations = { .name = "user", - .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, diff --git a/kernel/utsname.c b/kernel/utsname.c index 00001592ad13..a8cdc84648ee 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -146,7 +146,6 @@ static struct user_namespace *utsns_owner(struct ns_common *ns) const struct proc_ns_operations utsns_operations = { .name = "uts", - .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index bdea7d5fac56..dfe84bd35f98 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1543,7 +1543,6 @@ static struct user_namespace *netns_owner(struct ns_common *ns) const struct proc_ns_operations netns_operations = { .name = "net", - .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, -- cgit v1.2.3 From 4ae8d9aa9f9dc7137ea5e564d79c5aa5af1bc45c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Sep 2025 23:02:41 +0200 Subject: sched/deadline: Fix dl_server getting stuck John found it was easy to hit lockup warnings when running locktorture on a 2 CPU VM, which he bisected down to: commit cccb45d7c429 ("sched/deadline: Less agressive dl_server handling"). While debugging it seems there is a chance where we end up with the dl_server dequeued, with dl_se->dl_server_active. This causes dl_server_start() to return without enqueueing the dl_server, thus it fails to run when RT tasks starve the cpu. When this happens, dl_server_timer() catches the '!dl_se->server_has_tasks(dl_se)' case, which then calls replenish_dl_entity() and dl_server_stopped() and finally return HRTIMER_NO_RESTART. This ends in no new timer and also no enqueue, leaving the dl_server 'dead', allowing starvation. What should have happened is for the bandwidth timer to start the zero-laxity timer, which in turn would enqueue the dl_server and cause dl_se->server_pick_task() to be called -- which will stop the dl_server if no fair tasks are observed for a whole period. IOW, it is totally irrelevant if there are fair tasks at the moment of bandwidth refresh. This removes all dl_se->server_has_tasks() users, so remove the whole thing. 
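After the removal, wiring up a server takes only the pick callback; a kernel-internal sketch mirroring the fair-class hunks in the diff below:

static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
{
	/* NULL here means no fair task is runnable */
	return pick_task_fair(dl_se->rq);
}

void fair_server_init(struct rq *rq)
{
	struct sched_dl_entity *dl_se = &rq->fair_server;

	init_dl_entity(dl_se);
	dl_server_init(dl_se, rq, fair_server_pick_task);
}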
Fixes: cccb45d7c4295 ("sched/deadline: Less agressive dl_server handling") Reported-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Peter Zijlstra (Intel) Tested-by: John Stultz --- include/linux/sched.h | 1 - kernel/sched/deadline.c | 12 +----------- kernel/sched/fair.c | 7 +------ kernel/sched/sched.h | 4 ---- 4 files changed, 2 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f8188b833350..f89313b150e6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -733,7 +733,6 @@ struct sched_dl_entity { * runnable task. */ struct rq *rq; - dl_server_has_tasks_f server_has_tasks; dl_server_pick_f server_pick_task; #ifdef CONFIG_RT_MUTEXES diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f25301267e47..5a5080b3a670 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -875,7 +875,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) */ if (dl_se->dl_defer && !dl_se->dl_defer_running && dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { - if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { + if (!is_dl_boosted(dl_se)) { /* * Set dl_se->dl_defer_armed and dl_throttled variables to @@ -1152,8 +1152,6 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; -static bool dl_server_stopped(struct sched_dl_entity *dl_se); - static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) { struct rq *rq = rq_of_dl_se(dl_se); @@ -1171,12 +1169,6 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_ if (!dl_se->dl_runtime) return HRTIMER_NORESTART; - if (!dl_se->server_has_tasks(dl_se)) { - replenish_dl_entity(dl_se); - dl_server_stopped(dl_se); - return HRTIMER_NORESTART; - } - if (dl_se->dl_defer_armed) { /* * First check if the server could consume runtime in background. @@ -1625,11 +1617,9 @@ static bool dl_server_stopped(struct sched_dl_entity *dl_se) } void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, - dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task) { dl_se->rq = rq; - dl_se->server_has_tasks = has_tasks; dl_se->server_pick_task = pick_task; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c..8ce56a8d507f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8859,11 +8859,6 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru return pick_next_task_fair(rq, prev, NULL); } -static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) -{ - return !!dl_se->rq->cfs.nr_queued; -} - static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) { return pick_task_fair(dl_se->rq); @@ -8875,7 +8870,7 @@ void fair_server_init(struct rq *rq) init_dl_entity(dl_se); - dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task); + dl_server_init(dl_se, rq, fair_server_pick_task); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index be9745d104f7..f10d6277dca1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -365,9 +365,6 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6 * * dl_se::rq -- runqueue we belong to. 
* - * dl_se::server_has_tasks() -- used on bandwidth enforcement; we 'stop' the - * server when it runs out of tasks to run. - * * dl_se::server_pick() -- nested pick_next_task(); we yield the period if this * returns NULL. * @@ -383,7 +380,6 @@ extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); extern void dl_server_start(struct sched_dl_entity *dl_se); extern void dl_server_stop(struct sched_dl_entity *dl_se); extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, - dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task); extern void sched_init_dl_servers(void); -- cgit v1.2.3 From a3a70caf7906708bf9bbc80018752a6b36543808 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 17 Sep 2025 12:03:20 +0200 Subject: sched/deadline: Fix dl_server behaviour John reported undesirable behaviour with the dl_server since commit: cccb45d7c4295 ("sched/deadline: Less agressive dl_server handling"). When starving fair tasks on purpose (starting spinning FIFO tasks), his fair workload, which often goes (briefly) idle, would see fair invocations delayed for a second; running one invocation per second was both unexpected and terribly slow. The reason this happens is that when dl_se->server_pick_task() returns NULL, indicating no runnable tasks, it would yield, pushing any later jobs out a whole period (1 second). Instead, simply stop the server. This should restore behaviour in that a later wakeup (which restarts the server) will be able to continue running (subject to the CBS wakeup rules). Notably, this does not re-introduce the behaviour cccb45d7c4295 set out to solve: any start/stop cycle is naturally throttled by the timer period (no active cancel). Fixes: cccb45d7c4295 ("sched/deadline: Less agressive dl_server handling") Reported-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Peter Zijlstra (Intel) Tested-by: John Stultz --- include/linux/sched.h | 1 - kernel/sched/deadline.c | 23 ++--------------------- kernel/sched/sched.h | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 33 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f89313b150e6..e4ce0a76831e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -706,7 +706,6 @@ struct sched_dl_entity { unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; - unsigned int dl_server_idle : 1; /* * Bandwidth enforcement timer.
Each -deadline task has its diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5a5080b3a670..72c1f72463c7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1571,10 +1571,8 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { /* 0 runtime = fair server disabled */ - if (dl_se->dl_runtime) { - dl_se->dl_server_idle = 0; + if (dl_se->dl_runtime) update_curr_dl_se(dl_se->rq, dl_se, delta_exec); - } } void dl_server_start(struct sched_dl_entity *dl_se) @@ -1602,20 +1600,6 @@ void dl_server_stop(struct sched_dl_entity *dl_se) dl_se->dl_server_active = 0; } -static bool dl_server_stopped(struct sched_dl_entity *dl_se) -{ - if (!dl_se->dl_server_active) - return true; - - if (dl_se->dl_server_idle) { - dl_server_stop(dl_se); - return true; - } - - dl_se->dl_server_idle = 1; - return false; -} - void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_pick_f pick_task) { @@ -2384,10 +2368,7 @@ again: if (dl_server(dl_se)) { p = dl_se->server_pick_task(dl_se); if (!p) { - if (!dl_server_stopped(dl_se)) { - dl_se->dl_yielded = 1; - update_curr_dl_se(rq, dl_se, 0); - } + dl_server_stop(dl_se); goto again; } rq->dl_server = dl_se; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f10d6277dca1..cf2109b67f9a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -371,10 +371,39 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6 * dl_server_update() -- called from update_curr_common(), propagates runtime * to the server. * - * dl_server_start() - * dl_server_stop() -- start/stop the server when it has (no) tasks. + * dl_server_start() -- start the server when it has tasks; it will stop + * automatically when there are no more tasks, per + * dl_se::server_pick() returning NULL. + * + * dl_server_stop() -- (force) stop the server; use when updating + * parameters. * * dl_server_init() -- initializes the server. + * + * When started the dl_server will (per dl_defer) schedule a timer for its + * zero-laxity point -- that is, unlike regular EDF tasks which run ASAP, a + * server will run at the very end of its period. + * + * This is done such that any runtime from the target class can be accounted + * against the server -- through dl_server_update() above -- such that when it + * becomes time to run, it might already be out of runtime and get deferred + * until the next period. In this case dl_server_timer() will alternate + * between defer and replenish but never actually enqueue the server. + * + * Only when the target class does not manage to exhaust the server's runtime + * (there's actualy starvation in the given period), will the dl_server get on + * the runqueue. Once queued it will pick tasks from the target class and run + * them until either its runtime is exhaused, at which point its back to + * dl_server_timer, or until there are no more tasks to run, at which point + * the dl_server stops itself. + * + * By stopping at this point the dl_server retains bandwidth, which, if a new + * task wakes up imminently (starting the server again), can be used -- + * subject to CBS wakeup rules -- without having to wait for the next period. + * + * Additionally, because of the dl_defer behaviour the start/stop behaviour is + * naturally thottled to once per period, avoiding high context switch + * workloads from spamming the hrtimer program/cancel paths. 
*/ extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); extern void dl_server_start(struct sched_dl_entity *dl_se); -- cgit v1.2.3 From 88a90315a99a9120cd471bf681515cc77cd7cdb8 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 17 Sep 2025 14:09:14 +0800 Subject: rcu: Replace preempt.h with sched.h in include/linux/rcupdate.h In the next commit, we will move the definition of migrate_enable() and migrate_disable() to linux/sched.h. However, migrate_enable/migrate_disable will be used in commit 1b93c03fb319 ("rcu: add rcu_read_lock_dont_migrate()") in the bpf-next tree. In order to fix a potential compile error, replace linux/preempt.h with linux/sched.h in include/linux/rcupdate.h. Signed-off-by: Menglong Dong Signed-off-by: Peter Zijlstra (Intel) --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 120536f4c6eb..8f346c847ee5 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From 378b7708194fff77c9020392067329931c3fcc04 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 17 Sep 2025 14:09:15 +0800 Subject: sched: Make migrate_{en,dis}able() inline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For now, migrate_enable and migrate_disable are global, which makes them hotspots in some cases. Take BPF for example: the calls to migrate_enable and migrate_disable in the BPF trampoline can introduce significant overhead, and the following is the 'perf top' of FENTRY's benchmark (./tools/testing/selftests/bpf/bench trig-fentry):

54.63% bpf_prog_2dcccf652aac1793_bench_trigger_fentry [k] bpf_prog_2dcccf652aac1793_bench_trigger_fentry
10.43% [kernel] [k] migrate_enable
10.07% bpf_trampoline_6442517037 [k] bpf_trampoline_6442517037
 8.06% [kernel] [k] __bpf_prog_exit_recur
 4.11% libc.so.6 [.] syscall
 2.15% [kernel] [k] entry_SYSCALL_64
 1.48% [kernel] [k] memchr_inv
 1.32% [kernel] [k] fput
 1.16% [kernel] [k] _copy_to_user
 0.73% [kernel] [k] bpf_prog_test_run_raw_tp

So in this commit, we make migrate_enable/migrate_disable inline to obtain better performance. The struct rq is defined internally in kernel/sched/sched.h, and the field "nr_pinned" is accessed in migrate_enable/migrate_disable, which makes it hard to make them inline. Alexei Starovoitov suggested generating the offset of "nr_pinned" in [1], so we can define migrate_enable/migrate_disable in include/linux/sched.h and access "this_rq()->nr_pinned" with "(void *)this_rq() + RQ_nr_pinned". The offset of "nr_pinned" is generated in include/generated/rq-offsets.h by kernel/sched/rq-offsets.c. Generally speaking, we move the definition of migrate_enable and migrate_disable to include/linux/sched.h from kernel/sched/core.c. The call to __set_cpus_allowed_ptr() is left in ___migrate_enable(). The "struct rq" is not available in include/linux/sched.h, so we can't access "runqueues" with this_cpu_ptr(), as the compilation will fail in this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr(): typeof((ptr) + 0) So we introduce this_rq_raw() and access the runqueues with arch_raw_cpu_ptr/PERCPU_PTR directly. The variable "runqueues" is not visible in the kernel modules, and exporting it is not a good idea.
As Peter Zijlstra advised in [2], we define and export migrate_enable/migrate_disable in kernel/sched/core.c too, and use them for the modules. Before this patch, the performance of BPF FENTRY is: fentry : 113.030 ± 0.149M/s fentry : 112.501 ± 0.187M/s fentry : 112.828 ± 0.267M/s fentry : 115.287 ± 0.241M/s After this patch, the performance of BPF FENTRY increases to: fentry : 143.644 ± 0.670M/s fentry : 149.764 ± 0.362M/s fentry : 149.642 ± 0.156M/s fentry : 145.263 ± 0.221M/s Signed-off-by: Menglong Dong Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/CAADnVQ+5sEDKHdsJY5ZsfGDO_1SEhhQWHrt2SMBG5SYyQ+jt7w@mail.gmail.com/ [1] Link: https://lore.kernel.org/all/20250819123214.GH4067720@noisy.programming.kicks-ass.net/ [2] --- Kbuild | 13 +++++- include/linux/preempt.h | 3 -- include/linux/sched.h | 113 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 1 + kernel/sched/core.c | 63 ++++++-------------------- kernel/sched/rq-offsets.c | 12 +++++ 6 files changed, 152 insertions(+), 53 deletions(-) create mode 100644 kernel/sched/rq-offsets.c (limited to 'include/linux') diff --git a/Kbuild b/Kbuild index f327ca86990c..13324b4bbe23 100644 --- a/Kbuild +++ b/Kbuild @@ -34,13 +34,24 @@ arch/$(SRCARCH)/kernel/asm-offsets.s: $(timeconst-file) $(bounds-file) $(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE $(call filechk,offsets,__ASM_OFFSETS_H__) +# Generate rq-offsets.h + +rq-offsets-file := include/generated/rq-offsets.h + +targets += kernel/sched/rq-offsets.s + +kernel/sched/rq-offsets.s: $(offsets-file) + +$(rq-offsets-file): kernel/sched/rq-offsets.s FORCE + $(call filechk,offsets,__RQ_OFFSETS_H__) + # Check for missing system calls quiet_cmd_syscalls = CALL $< cmd_syscalls = $(CONFIG_SHELL) $< $(CC) $(c_flags) $(missing_syscalls_flags) PHONY += missing-syscalls -missing-syscalls: scripts/checksyscalls.sh $(offsets-file) +missing-syscalls: scripts/checksyscalls.sh $(rq-offsets-file) $(call cmd,syscalls) # Check the manual modification of atomic headers diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 1fad1c8a4c76..92237c319035 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -424,8 +424,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * work-conserving schedulers. 
* */ -extern void migrate_disable(void); -extern void migrate_enable(void); /** * preempt_disable_nested - Disable preemption inside a normally preempt disabled section @@ -471,7 +469,6 @@ static __always_inline void preempt_enable_nested(void) DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable()) DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace()) -DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) #ifdef CONFIG_PREEMPT_DYNAMIC diff --git a/include/linux/sched.h b/include/linux/sched.h index 644a01bdae70..d60ecaccdffc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -49,6 +49,9 @@ #include #include #include +#ifndef COMPILE_OFFSETS +#include +#endif /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; @@ -2317,4 +2320,114 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo #define alloc_tag_restore(_tag, _old) do {} while (0) #endif +#ifndef MODULE +#ifndef COMPILE_OFFSETS + +extern void ___migrate_enable(void); + +struct rq; +DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); + +/* + * The "struct rq" is not available here, so we can't access the + * "runqueues" with this_cpu_ptr(), as the compilation will fail in + * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr(): + * typeof((ptr) + 0) + * + * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here. + */ +#ifdef CONFIG_SMP +#define this_rq_raw() arch_raw_cpu_ptr(&runqueues) +#else +#define this_rq_raw() PERCPU_PTR(&runqueues) +#endif +#define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned)) + +static inline void __migrate_enable(void) +{ + struct task_struct *p = current; + +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Check both overflow from migrate_disable() and superfluous + * migrate_enable(). + */ + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) + return; +#endif + + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). + */ + guard(preempt)(); + if (unlikely(p->cpus_ptr != &p->cpus_mask)) + ___migrate_enable(); + /* + * Mustn't clear migration_disabled() until cpus_ptr points back at the + * regular cpus_mask, otherwise things that race (eg. + * select_fallback_rq) get confused. + */ + barrier(); + p->migration_disabled = 0; + this_rq_pinned()--; +} + +static inline void __migrate_disable(void) +{ + struct task_struct *p = current; + + if (p->migration_disabled) { +#ifdef CONFIG_DEBUG_PREEMPT + /* + *Warn about overflow half-way through the range. + */ + WARN_ON_ONCE((s16)p->migration_disabled < 0); +#endif + p->migration_disabled++; + return; + } + + guard(preempt)(); + this_rq_pinned()++; + p->migration_disabled = 1; +} +#else /* !COMPILE_OFFSETS */ +static inline void __migrate_disable(void) { } +static inline void __migrate_enable(void) { } +#endif /* !COMPILE_OFFSETS */ + +/* + * So that it is possible to not export the runqueues variable, define and + * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use + * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will + * be defined in kernel/sched/core.c. 
+ */ +#ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE +static inline void migrate_disable(void) +{ + __migrate_disable(); +} + +static inline void migrate_enable(void) +{ + __migrate_enable(); +} +#else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ + +#else /* MODULE */ +extern void migrate_disable(void); +extern void migrate_enable(void); +#endif /* MODULE */ + +DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) + #endif diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4f69a9e9af6..de9078a9df3a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23855,6 +23855,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, BTF_SET_START(btf_id_deny) BTF_ID_UNUSED #ifdef CONFIG_SMP +BTF_ID(func, ___migrate_enable) BTF_ID(func, migrate_disable) BTF_ID(func, migrate_enable) #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index feb750aae71b..7f1e5cb94c53 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7,6 +7,8 @@ * Copyright (C) 1991-2002 Linus Torvalds * Copyright (C) 1998-2024 Ingo Molnar, Red Hat */ +#define INSTANTIATE_EXPORTED_MIGRATE_DISABLE +#include #include #include #include @@ -2381,28 +2383,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) __do_set_cpus_allowed(p, &ac); } -void migrate_disable(void) -{ - struct task_struct *p = current; - - if (p->migration_disabled) { -#ifdef CONFIG_DEBUG_PREEMPT - /* - *Warn about overflow half-way through the range. - */ - WARN_ON_ONCE((s16)p->migration_disabled < 0); -#endif - p->migration_disabled++; - return; - } - - guard(preempt)(); - this_rq()->nr_pinned++; - p->migration_disabled = 1; -} -EXPORT_SYMBOL_GPL(migrate_disable); - -void migrate_enable(void) +void ___migrate_enable(void) { struct task_struct *p = current; struct affinity_context ac = { @@ -2410,35 +2391,19 @@ void migrate_enable(void) .flags = SCA_MIGRATE_ENABLE, }; -#ifdef CONFIG_DEBUG_PREEMPT - /* - * Check both overflow from migrate_disable() and superfluous - * migrate_enable(). - */ - if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) - return; -#endif + __set_cpus_allowed_ptr(p, &ac); +} +EXPORT_SYMBOL_GPL(___migrate_enable); - if (p->migration_disabled > 1) { - p->migration_disabled--; - return; - } +void migrate_disable(void) +{ + __migrate_disable(); +} +EXPORT_SYMBOL_GPL(migrate_disable); - /* - * Ensure stop_task runs either before or after this, and that - * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). - */ - guard(preempt)(); - if (p->cpus_ptr != &p->cpus_mask) - __set_cpus_allowed_ptr(p, &ac); - /* - * Mustn't clear migration_disabled() until cpus_ptr points back at the - * regular cpus_mask, otherwise things that race (eg. - * select_fallback_rq) get confused. 
- */ - barrier(); - p->migration_disabled = 0; - this_rq()->nr_pinned--; +void migrate_enable(void) +{ + __migrate_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); diff --git a/kernel/sched/rq-offsets.c b/kernel/sched/rq-offsets.c new file mode 100644 index 000000000000..a23747bbe25b --- /dev/null +++ b/kernel/sched/rq-offsets.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +#define COMPILE_OFFSETS +#include +#include +#include "sched.h" + +int main(void) +{ + DEFINE(RQ_nr_pinned, offsetof(struct rq, nr_pinned)); + + return 0; +} -- cgit v1.2.3 From 45b7f780739a3145aeef24d2dfa02517a6c82ed6 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 17 Sep 2025 14:09:16 +0800 Subject: sched: Fix some typos in include/linux/preempt.h There are some typos in the comments of migrate in include/linux/preempt.h: elegible -> eligible it's -> its migirate_disable -> migrate_disable abritrary -> arbitrary Just fix them. Signed-off-by: Menglong Dong Signed-off-by: Peter Zijlstra (Intel) --- include/linux/preempt.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 92237c319035..102202185d7a 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -372,7 +372,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, /* * Migrate-Disable and why it is undesired. * - * When a preempted task becomes elegible to run under the ideal model (IOW it + * When a preempted task becomes eligible to run under the ideal model (IOW it * becomes one of the M highest priority tasks), it might still have to wait * for the preemptee's migrate_disable() section to complete. Thereby suffering * a reduction in bandwidth in the exact duration of the migrate_disable() @@ -387,7 +387,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * - a lower priority tasks; which under preempt_disable() could've instantly * migrated away when another CPU becomes available, is now constrained * by the ability to push the higher priority task away, which might itself be - * in a migrate_disable() section, reducing it's available bandwidth. + * in a migrate_disable() section, reducing its available bandwidth. * * IOW it trades latency / moves the interference term, but it stays in the * system, and as long as it remains unbounded, the system is not fully @@ -399,7 +399,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a * number of primitives into becoming preemptible, they would also allow * migration. This turns out to break a bunch of per-cpu usage. To this end, - * all these primitives employ migirate_disable() to restore this implicit + * all these primitives employ migrate_disable() to restore this implicit * assumption. * * This is a 'temporary' work-around at best. The correct solution is getting @@ -407,7 +407,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, * per-cpu locking or short preempt-disable regions. * * The end goal must be to get rid of migrate_disable(), alternatively we need - * a schedulability theory that does not depend on abritrary migration. + * a schedulability theory that does not depend on arbitrary migration. * * * Notes on the implementation. 
-- cgit v1.2.3 From 3271f19bf7b9df665549666d789b9f126b4420c7 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 23 Sep 2025 10:59:06 +0200 Subject: net: gso: restore ids of outer ip headers correctly Currently, NETIF_F_TSO_MANGLEID indicates that the inner-most ID can be mangled. Outer IDs can always be mangled. Make GSO preserve outer IDs by default, with NETIF_F_TSO_MANGLEID allowing both inner and outer IDs to be mangled. This commit also modifies a few drivers that use SKB_GSO_FIXEDID directly. Signed-off-by: Richard Gobert Reviewed-by: Edward Cree # for sfc Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250923085908.4687-4-richardbgobert@gmail.com Signed-off-by: Paolo Abeni --- Documentation/networking/segmentation-offloads.rst | 22 ++++++++++++++-------- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 8 ++++++-- drivers/net/ethernet/sfc/ef100_tx.c | 17 +++++++++++++---- include/linux/netdevice.h | 9 +++++++-- include/linux/skbuff.h | 8 +++++++- net/core/dev.c | 10 ++++++++-- net/ipv4/af_inet.c | 13 ++++++------- net/ipv4/tcp_offload.c | 6 ++---- 8 files changed, 63 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/segmentation-offloads.rst b/Documentation/networking/segmentation-offloads.rst index 085e8fab03fd..72f69b22b28c 100644 --- a/Documentation/networking/segmentation-offloads.rst +++ b/Documentation/networking/segmentation-offloads.rst @@ -43,10 +43,19 @@ also point to the TCP header of the packet. For IPv4 segmentation we support one of two types in terms of the IP ID. The default behavior is to increment the IP ID with every segment. If the GSO type SKB_GSO_TCP_FIXEDID is specified then we will not increment the IP -ID and all segments will use the same IP ID. If a device has -NETIF_F_TSO_MANGLEID set then the IP ID can be ignored when performing TSO -and we will either increment the IP ID for all frames, or leave it at a -static value based on driver preference. +ID and all segments will use the same IP ID. + +For encapsulated packets, SKB_GSO_TCP_FIXEDID refers only to the outer header. +SKB_GSO_TCP_FIXEDID_INNER can be used to specify the same for the inner header. +Any combination of these two GSO types is allowed. + +If a device has NETIF_F_TSO_MANGLEID set then the IP ID can be ignored when +performing TSO and we will either increment the IP ID for all frames, or leave +it at a static value based on driver preference. For encapsulated packets, +NETIF_F_TSO_MANGLEID is relevant for both outer and inner headers, unless the +DF bit is not set on the outer header, in which case the device driver must +guarantee that the IP ID field is incremented in the outer header with every +segment. UDP Fragmentation Offload @@ -124,10 +133,7 @@ Generic Receive Offload Generic receive offload is the complement to GSO. Ideally any frame assembled by GRO should be segmented to create an identical sequence of frames using GSO, and any sequence of frames segmented by GSO should be -able to be reassembled back to the original by GRO. The only exception to -this is IPv4 ID in the case that the DF bit is set for a given IP header. -If the value of the IPv4 ID is not sequentially incrementing it will be -altered so that it is when a frame assembled via GRO is segmented via GSO. +able to be reassembled back to the original by GRO. 
Partial Generic Segmentation Offload diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 4ed43ee9aa35..263d5628ee44 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1290,8 +1290,12 @@ static void mlx5e_shampo_update_ipv4_tcp_hdr(struct mlx5e_rq *rq, struct iphdr * tcp->check = ~tcp_v4_check(skb->len - tcp_off, ipv4->saddr, ipv4->daddr, 0); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; - if (ntohs(ipv4->id) == rq->hw_gro_data->second_ip_id) - skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID; + if (ntohs(ipv4->id) == rq->hw_gro_data->second_ip_id) { + bool encap = rq->hw_gro_data->fk.control.flags & FLOW_DIS_ENCAPSULATION; + + skb_shinfo(skb)->gso_type |= encap ? SKB_GSO_TCP_FIXEDID_INNER : + SKB_GSO_TCP_FIXEDID; + } skb->csum_start = (unsigned char *)tcp - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); diff --git a/drivers/net/ethernet/sfc/ef100_tx.c b/drivers/net/ethernet/sfc/ef100_tx.c index e6b6be549581..03005757c060 100644 --- a/drivers/net/ethernet/sfc/ef100_tx.c +++ b/drivers/net/ethernet/sfc/ef100_tx.c @@ -189,6 +189,7 @@ static void ef100_make_tso_desc(struct efx_nic *efx, { bool gso_partial = skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL; unsigned int len, ip_offset, tcp_offset, payload_segs; + u32 mangleid_outer = ESE_GZ_TX_DESC_IP4_ID_INC_MOD16; u32 mangleid = ESE_GZ_TX_DESC_IP4_ID_INC_MOD16; unsigned int outer_ip_offset, outer_l4_offset; u16 vlan_tci = skb_vlan_tag_get(skb); @@ -200,8 +201,17 @@ static void ef100_make_tso_desc(struct efx_nic *efx, bool outer_csum; u32 paylen; - if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID) - mangleid = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + if (encap) { + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID_INNER) + mangleid = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID) + mangleid_outer = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + } else { + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID) + mangleid = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + mangleid_outer = ESE_GZ_TX_DESC_IP4_ID_NO_OP; + } + if (efx->net_dev->features & NETIF_F_HW_VLAN_CTAG_TX) vlan_enable = skb_vlan_tag_present(skb); @@ -245,8 +255,7 @@ static void ef100_make_tso_desc(struct efx_nic *efx, ESF_GZ_TX_TSO_OUTER_L4_OFF_W, outer_l4_offset >> 1, ESF_GZ_TX_TSO_ED_OUTER_UDP_LEN, udp_encap && !gso_partial, ESF_GZ_TX_TSO_ED_OUTER_IP_LEN, encap && !gso_partial, - ESF_GZ_TX_TSO_ED_OUTER_IP4_ID, encap ? 
mangleid : - ESE_GZ_TX_DESC_IP4_ID_NO_OP, + ESF_GZ_TX_TSO_ED_OUTER_IP4_ID, mangleid_outer, ESF_GZ_TX_TSO_VLAN_INSERT_EN, vlan_enable, ESF_GZ_TX_TSO_VLAN_INSERT_TCI, vlan_tci ); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1c54d44805fa..1b85454116f6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5320,13 +5320,18 @@ void skb_warn_bad_offload(const struct sk_buff *skb); static inline bool net_gso_ok(netdev_features_t features, int gso_type) { - netdev_features_t feature = (netdev_features_t)gso_type << NETIF_F_GSO_SHIFT; + netdev_features_t feature; + + if (gso_type & (SKB_GSO_TCP_FIXEDID | SKB_GSO_TCP_FIXEDID_INNER)) + gso_type |= __SKB_GSO_TCP_FIXEDID; + + feature = ((netdev_features_t)gso_type << NETIF_F_GSO_SHIFT) & NETIF_F_GSO_MASK; /* check flags correspondence */ BUILD_BUG_ON(SKB_GSO_TCPV4 != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_DODGY != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT)); - BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(__SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TCPV6 != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_FCOE != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_GRE != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT)); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 78ecfa7d00d0..fb3fec9affaa 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -674,7 +674,7 @@ enum { /* This indicates the tcp segment has CWR set. */ SKB_GSO_TCP_ECN = 1 << 2, - SKB_GSO_TCP_FIXEDID = 1 << 3, + __SKB_GSO_TCP_FIXEDID = 1 << 3, SKB_GSO_TCPV6 = 1 << 4, @@ -707,6 +707,12 @@ enum { SKB_GSO_FRAGLIST = 1 << 18, SKB_GSO_TCP_ACCECN = 1 << 19, + + /* These indirectly map onto the same netdev feature. + * If NETIF_F_TSO_MANGLEID is set it may mangle both inner and outer IDs. + */ + SKB_GSO_TCP_FIXEDID = 1 << 30, + SKB_GSO_TCP_FIXEDID_INNER = 1 << 31, }; #if BITS_PER_LONG > 32 diff --git a/net/core/dev.c b/net/core/dev.c index fc4993526ead..8b54fdf0289a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3768,8 +3768,14 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) features &= ~dev->gso_partial_features; - /* Make sure to clear the IPv4 ID mangling feature if the - * IPv4 header has the potential to be fragmented. + /* Make sure to clear the IPv4 ID mangling feature if the IPv4 header + * has the potential to be fragmented so that TSO does not generate + * segments with the same ID. For encapsulated packets, the ID mangling + * feature is guaranteed not to use the same ID for the outer IPv4 + * headers of the generated segments if the headers have the potential + * to be fragmented, so there is no need to clear the IPv4 ID mangling + * feature (see the section about NETIF_F_TSO_MANGLEID in + * segmentation-offloads.rst). */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { struct iphdr *iph = skb->encapsulation ? 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e298dacb4a06..804c51296c55 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1395,14 +1395,13 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); - if (!skb->encapsulation || encap) { - udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); - fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); + /* fixed ID is invalid if DF bit is not set */ + fixedid = !!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCP_FIXEDID << encap)); + if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) + goto out; - /* fixed ID is invalid if DF bit is not set */ - if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) - goto out; - } + if (!skb->encapsulation || encap) + udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 1949eede9ec9..2cb93da93abc 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -471,7 +471,6 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct iphdr *iph = (struct iphdr *)(skb->data + offset); struct tcphdr *th = tcp_hdr(skb); - bool is_fixedid; if (unlikely(NAPI_GRO_CB(skb)->is_flist)) { skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4; @@ -485,10 +484,9 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, iph->daddr, 0); - is_fixedid = (NAPI_GRO_CB(skb)->ip_fixedid >> skb->encapsulation) & 1; - + BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID << 1 != SKB_GSO_TCP_FIXEDID_INNER); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 | - (is_fixedid * SKB_GSO_TCP_FIXEDID); + (NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID); tcp_gro_complete(skb); return 0; -- cgit v1.2.3 From 60ac65a31041b0e5dfd736a79027314b9d533ef5 Mon Sep 17 00:00:00 2001 From: Sung-Chi Li Date: Thu, 11 Sep 2025 06:56:34 +0000 Subject: platform/chrome: update pwm fan control host commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update cros_ec_commands.h to include definitions for getting PWM fan duty, getting and setting the fan control mode. Signed-off-by: Sung-Chi Li Acked-by: Tzung-Bi Shih Reviewed-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250911-cros_ec_fan-v6-1-a1446cc098af@google.com Signed-off-by: Guenter Roeck --- include/linux/platform_data/cros_ec_commands.h | 29 +++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index c19b404e3d8d..69294f79cc88 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -1825,6 +1825,16 @@ struct ec_response_pwm_get_duty { uint16_t duty; /* Duty cycle, EC_PWM_MAX_DUTY = 100% */ } __ec_align2; +#define EC_CMD_PWM_GET_FAN_DUTY 0x0027 + +struct ec_params_pwm_get_fan_duty { + uint8_t fan_idx; +} __ec_align1; + +struct ec_response_pwm_get_fan_duty { + uint32_t percent; /* Percentage of duty cycle, ranging from 0 ~ 100 */ +} __ec_align4; + /*****************************************************************************/ /* * Lightbar commands. This looks worse than it is. 
Since we only use one HOST @@ -3127,14 +3137,31 @@ struct ec_params_thermal_set_threshold_v1 { /****************************************************************************/ -/* Toggle automatic fan control */ +/* Set or get fan control mode */ #define EC_CMD_THERMAL_AUTO_FAN_CTRL 0x0052 +enum ec_auto_fan_ctrl_cmd { + EC_AUTO_FAN_CONTROL_CMD_SET = 0, + EC_AUTO_FAN_CONTROL_CMD_GET, +}; + /* Version 1 of input params */ struct ec_params_auto_fan_ctrl_v1 { uint8_t fan_idx; } __ec_align1; +/* Version 2 of input params */ +struct ec_params_auto_fan_ctrl_v2 { + uint8_t fan_idx; + uint8_t cmd; /* enum ec_auto_fan_ctrl_cmd */ + uint8_t set_auto; /* only used with EC_AUTO_FAN_CONTROL_CMD_SET - bool + */ +} __ec_align4; + +struct ec_response_auto_fan_control { + uint8_t is_auto; /* bool */ +} __ec_align1; + /* Get/Set TMP006 calibration data */ #define EC_CMD_TMP006_GET_CALIBRATION 0x0053 #define EC_CMD_TMP006_SET_CALIBRATION 0x0054 -- cgit v1.2.3 From 5ba9f520f41a33c99fd5d1eb81b5650ed3517b88 Mon Sep 17 00:00:00 2001 From: Rahul Pathak Date: Mon, 18 Aug 2025 09:39:06 +0530 Subject: clk: Add clock driver for the RISC-V RPMI clock service group The RPMI specification defines a clock service group which can be accessed via SBI MPXY extension or dedicated S-mode RPMI transport. Add mailbox client based clock driver for the RISC-V RPMI clock service group. Reviewed-by: Stephen Boyd Reviewed-by: Andy Shevchenko Co-developed-by: Anup Patel Signed-off-by: Anup Patel Signed-off-by: Rahul Pathak Link: https://lore.kernel.org/r/20250818040920.272664-11-apatel@ventanamicro.com [pjw@kernel.org: converted rpmi_clkrate_u64 macro to a function; replaced bare constant with a macro] Signed-off-by: Paul Walmsley --- drivers/clk/Kconfig | 8 + drivers/clk/Makefile | 1 + drivers/clk/clk-rpmi.c | 620 +++++++++++++++++++++++++++++ include/linux/mailbox/riscv-rpmi-message.h | 16 + 4 files changed, 645 insertions(+) create mode 100644 drivers/clk/clk-rpmi.c (limited to 'include/linux') diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig index 4d56475f94fc..4d51fa589c1e 100644 --- a/drivers/clk/Kconfig +++ b/drivers/clk/Kconfig @@ -501,6 +501,14 @@ config COMMON_CLK_SP7021 Not all features of the PLL are currently supported by the driver. +config COMMON_CLK_RPMI + tristate "Clock driver based on RISC-V RPMI" + depends on MAILBOX + default RISCV + help + Support for clocks based on the clock service group defined by + the RISC-V platform management interface (RPMI) specification. + source "drivers/clk/actions/Kconfig" source "drivers/clk/analogbits/Kconfig" source "drivers/clk/baikal-t1/Kconfig" diff --git a/drivers/clk/Makefile b/drivers/clk/Makefile index 18ed29cfdc11..b74a1767ca27 100644 --- a/drivers/clk/Makefile +++ b/drivers/clk/Makefile @@ -86,6 +86,7 @@ obj-$(CONFIG_COMMON_CLK_PWM) += clk-pwm.o obj-$(CONFIG_CLK_QORIQ) += clk-qoriq.o obj-$(CONFIG_COMMON_CLK_RK808) += clk-rk808.o obj-$(CONFIG_COMMON_CLK_RP1) += clk-rp1.o +obj-$(CONFIG_COMMON_CLK_RPMI) += clk-rpmi.o obj-$(CONFIG_COMMON_CLK_HI655X) += clk-hi655x.o obj-$(CONFIG_COMMON_CLK_S2MPS11) += clk-s2mps11.o obj-$(CONFIG_COMMON_CLK_SCMI) += clk-scmi.o diff --git a/drivers/clk/clk-rpmi.c b/drivers/clk/clk-rpmi.c new file mode 100644 index 000000000000..921296aafa68 --- /dev/null +++ b/drivers/clk/clk-rpmi.c @@ -0,0 +1,620 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RISC-V MPXY Based Clock Driver + * + * Copyright (C) 2025 Ventana Micro Systems Ltd. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RPMI_CLK_DISCRETE_MAX_NUM_RATES 16 +#define RPMI_CLK_NAME_LEN 16 + +#define to_rpmi_clk(clk) container_of(clk, struct rpmi_clk, hw) + +enum rpmi_clk_config { + RPMI_CLK_DISABLE = 0, + RPMI_CLK_ENABLE = 1, + RPMI_CLK_CONFIG_MAX_IDX +}; + +#define RPMI_CLK_TYPE_MASK GENMASK(1, 0) +enum rpmi_clk_type { + RPMI_CLK_DISCRETE = 0, + RPMI_CLK_LINEAR = 1, + RPMI_CLK_TYPE_MAX_IDX +}; + +struct rpmi_clk_context { + struct device *dev; + struct mbox_chan *chan; + struct mbox_client client; + u32 max_msg_data_size; +}; + +/* + * rpmi_clk_rates represents the rates format + * as specified by the RPMI specification. + * No other data format (e.g., struct linear_range) + * is required to avoid to and from conversion. + */ +union rpmi_clk_rates { + u64 discrete[RPMI_CLK_DISCRETE_MAX_NUM_RATES]; + struct { + u64 min; + u64 max; + u64 step; + } linear; +}; + +struct rpmi_clk { + struct rpmi_clk_context *context; + u32 id; + u32 num_rates; + u32 transition_latency; + enum rpmi_clk_type type; + union rpmi_clk_rates *rates; + char name[RPMI_CLK_NAME_LEN]; + struct clk_hw hw; +}; + +struct rpmi_clk_rate_discrete { + __le32 lo; + __le32 hi; +}; + +struct rpmi_clk_rate_linear { + __le32 min_lo; + __le32 min_hi; + __le32 max_lo; + __le32 max_hi; + __le32 step_lo; + __le32 step_hi; +}; + +struct rpmi_get_num_clocks_rx { + __le32 status; + __le32 num_clocks; +}; + +struct rpmi_get_attrs_tx { + __le32 clkid; +}; + +struct rpmi_get_attrs_rx { + __le32 status; + __le32 flags; + __le32 num_rates; + __le32 transition_latency; + char name[RPMI_CLK_NAME_LEN]; +}; + +struct rpmi_get_supp_rates_tx { + __le32 clkid; + __le32 clk_rate_idx; +}; + +struct rpmi_get_supp_rates_rx { + __le32 status; + __le32 flags; + __le32 remaining; + __le32 returned; + __le32 rates[]; +}; + +struct rpmi_get_rate_tx { + __le32 clkid; +}; + +struct rpmi_get_rate_rx { + __le32 status; + __le32 lo; + __le32 hi; +}; + +struct rpmi_set_rate_tx { + __le32 clkid; + __le32 flags; + __le32 lo; + __le32 hi; +}; + +struct rpmi_set_rate_rx { + __le32 status; +}; + +struct rpmi_set_config_tx { + __le32 clkid; + __le32 config; +}; + +struct rpmi_set_config_rx { + __le32 status; +}; + +static inline u64 rpmi_clkrate_u64(u32 __hi, u32 __lo) +{ + return (((u64)(__hi) << 32) | (u32)(__lo)); +} + +static u32 rpmi_clk_get_num_clocks(struct rpmi_clk_context *context) +{ + struct rpmi_get_num_clocks_rx rx, *resp; + struct rpmi_mbox_message msg; + int ret; + + rpmi_mbox_init_send_with_response(&msg, RPMI_CLK_SRV_GET_NUM_CLOCKS, + NULL, 0, &rx, sizeof(rx)); + + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return 0; + + resp = rpmi_mbox_get_msg_response(&msg); + if (!resp || resp->status) + return 0; + + return le32_to_cpu(resp->num_clocks); +} + +static int rpmi_clk_get_attrs(u32 clkid, struct rpmi_clk *rpmi_clk) +{ + struct rpmi_clk_context *context = rpmi_clk->context; + struct rpmi_mbox_message msg; + struct rpmi_get_attrs_tx tx; + struct rpmi_get_attrs_rx rx, *resp; + u8 format; + int ret; + + tx.clkid = cpu_to_le32(clkid); + rpmi_mbox_init_send_with_response(&msg, RPMI_CLK_SRV_GET_ATTRIBUTES, + &tx, sizeof(tx), &rx, sizeof(rx)); + + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return ret; + + resp = rpmi_mbox_get_msg_response(&msg); + if (!resp) + return -EINVAL; + if (resp->status) + return rpmi_to_linux_error(le32_to_cpu(resp->status)); + + rpmi_clk->id = clkid; + rpmi_clk->num_rates = le32_to_cpu(resp->num_rates); + 
rpmi_clk->transition_latency = le32_to_cpu(resp->transition_latency); + strscpy(rpmi_clk->name, resp->name, RPMI_CLK_NAME_LEN); + + format = le32_to_cpu(resp->flags) & RPMI_CLK_TYPE_MASK; + if (format >= RPMI_CLK_TYPE_MAX_IDX) + return -EINVAL; + + rpmi_clk->type = format; + + return 0; +} + +static int rpmi_clk_get_supported_rates(u32 clkid, struct rpmi_clk *rpmi_clk) +{ + struct rpmi_clk_context *context = rpmi_clk->context; + struct rpmi_clk_rate_discrete *rate_discrete; + struct rpmi_clk_rate_linear *rate_linear; + struct rpmi_get_supp_rates_tx tx; + struct rpmi_get_supp_rates_rx *resp; + struct rpmi_mbox_message msg; + size_t clk_rate_idx; + int ret, rateidx, j; + + tx.clkid = cpu_to_le32(clkid); + tx.clk_rate_idx = 0; + + /* + * Make sure we allocate rx buffer sufficient to be accommodate all + * the rates sent in one RPMI message. + */ + struct rpmi_get_supp_rates_rx *rx __free(kfree) = + kzalloc(context->max_msg_data_size, GFP_KERNEL); + if (!rx) + return -ENOMEM; + + rpmi_mbox_init_send_with_response(&msg, RPMI_CLK_SRV_GET_SUPPORTED_RATES, + &tx, sizeof(tx), rx, context->max_msg_data_size); + + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return ret; + + resp = rpmi_mbox_get_msg_response(&msg); + if (!resp) + return -EINVAL; + if (resp->status) + return rpmi_to_linux_error(le32_to_cpu(resp->status)); + if (!le32_to_cpu(resp->returned)) + return -EINVAL; + + if (rpmi_clk->type == RPMI_CLK_DISCRETE) { + rate_discrete = (struct rpmi_clk_rate_discrete *)resp->rates; + + for (rateidx = 0; rateidx < le32_to_cpu(resp->returned); rateidx++) { + rpmi_clk->rates->discrete[rateidx] = + rpmi_clkrate_u64(le32_to_cpu(rate_discrete[rateidx].hi), + le32_to_cpu(rate_discrete[rateidx].lo)); + } + + /* + * Keep sending the request message until all + * the rates are received. 
+ */ + clk_rate_idx = 0; + while (le32_to_cpu(resp->remaining)) { + clk_rate_idx += le32_to_cpu(resp->returned); + tx.clk_rate_idx = cpu_to_le32(clk_rate_idx); + + rpmi_mbox_init_send_with_response(&msg, + RPMI_CLK_SRV_GET_SUPPORTED_RATES, + &tx, sizeof(tx), + rx, context->max_msg_data_size); + + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return ret; + + resp = rpmi_mbox_get_msg_response(&msg); + if (!resp) + return -EINVAL; + if (resp->status) + return rpmi_to_linux_error(le32_to_cpu(resp->status)); + if (!le32_to_cpu(resp->returned)) + return -EINVAL; + + for (j = 0; j < le32_to_cpu(resp->returned); j++) { + if (rateidx >= clk_rate_idx + le32_to_cpu(resp->returned)) + break; + rpmi_clk->rates->discrete[rateidx++] = + rpmi_clkrate_u64(le32_to_cpu(rate_discrete[j].hi), + le32_to_cpu(rate_discrete[j].lo)); + } + } + } else if (rpmi_clk->type == RPMI_CLK_LINEAR) { + rate_linear = (struct rpmi_clk_rate_linear *)resp->rates; + + rpmi_clk->rates->linear.min = rpmi_clkrate_u64(le32_to_cpu(rate_linear->min_hi), + le32_to_cpu(rate_linear->min_lo)); + rpmi_clk->rates->linear.max = rpmi_clkrate_u64(le32_to_cpu(rate_linear->max_hi), + le32_to_cpu(rate_linear->max_lo)); + rpmi_clk->rates->linear.step = rpmi_clkrate_u64(le32_to_cpu(rate_linear->step_hi), + le32_to_cpu(rate_linear->step_lo)); + } + + return 0; +} + +static unsigned long rpmi_clk_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct rpmi_clk *rpmi_clk = to_rpmi_clk(hw); + struct rpmi_clk_context *context = rpmi_clk->context; + struct rpmi_mbox_message msg; + struct rpmi_get_rate_tx tx; + struct rpmi_get_rate_rx rx, *resp; + int ret; + + tx.clkid = cpu_to_le32(rpmi_clk->id); + + rpmi_mbox_init_send_with_response(&msg, RPMI_CLK_SRV_GET_RATE, + &tx, sizeof(tx), &rx, sizeof(rx)); + + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return ret; + + resp = rpmi_mbox_get_msg_response(&msg); + if (!resp) + return -EINVAL; + if (resp->status) + return rpmi_to_linux_error(le32_to_cpu(resp->status)); + + return rpmi_clkrate_u64(le32_to_cpu(resp->hi), le32_to_cpu(resp->lo)); +} + +static int rpmi_clk_determine_rate(struct clk_hw *hw, + struct clk_rate_request *req) +{ + struct rpmi_clk *rpmi_clk = to_rpmi_clk(hw); + u64 fmin, fmax, ftmp; + + /* + * Keep the requested rate if the clock format + * is of discrete type. Let the platform which + * is actually controlling the clock handle that. 
+ */ + if (rpmi_clk->type == RPMI_CLK_DISCRETE) + return 0; + + fmin = rpmi_clk->rates->linear.min; + fmax = rpmi_clk->rates->linear.max; + + if (req->rate <= fmin) { + req->rate = fmin; + return 0; + } else if (req->rate >= fmax) { + req->rate = fmax; + return 0; + } + + ftmp = req->rate - fmin; + ftmp += rpmi_clk->rates->linear.step - 1; + do_div(ftmp, rpmi_clk->rates->linear.step); + + req->rate = ftmp * rpmi_clk->rates->linear.step + fmin; + + return 0; +} + +static int rpmi_clk_set_rate(struct clk_hw *hw, unsigned long rate, + unsigned long parent_rate) +{ + struct rpmi_clk *rpmi_clk = to_rpmi_clk(hw); + struct rpmi_clk_context *context = rpmi_clk->context; + struct rpmi_mbox_message msg; + struct rpmi_set_rate_tx tx; + struct rpmi_set_rate_rx rx, *resp; + int ret; + + tx.clkid = cpu_to_le32(rpmi_clk->id); + tx.lo = cpu_to_le32(lower_32_bits(rate)); + tx.hi = cpu_to_le32(upper_32_bits(rate)); + + rpmi_mbox_init_send_with_response(&msg, RPMI_CLK_SRV_SET_RATE, + &tx, sizeof(tx), &rx, sizeof(rx)); + + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return ret; + + resp = rpmi_mbox_get_msg_response(&msg); + if (!resp) + return -EINVAL; + if (resp->status) + return rpmi_to_linux_error(le32_to_cpu(resp->status)); + + return 0; +} + +static int rpmi_clk_enable(struct clk_hw *hw) +{ + struct rpmi_clk *rpmi_clk = to_rpmi_clk(hw); + struct rpmi_clk_context *context = rpmi_clk->context; + struct rpmi_mbox_message msg; + struct rpmi_set_config_tx tx; + struct rpmi_set_config_rx rx, *resp; + int ret; + + tx.config = cpu_to_le32(RPMI_CLK_ENABLE); + tx.clkid = cpu_to_le32(rpmi_clk->id); + + rpmi_mbox_init_send_with_response(&msg, RPMI_CLK_SRV_SET_CONFIG, + &tx, sizeof(tx), &rx, sizeof(rx)); + + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return ret; + + resp = rpmi_mbox_get_msg_response(&msg); + if (!resp) + return -EINVAL; + if (resp->status) + return rpmi_to_linux_error(le32_to_cpu(resp->status)); + + return 0; +} + +static void rpmi_clk_disable(struct clk_hw *hw) +{ + struct rpmi_clk *rpmi_clk = to_rpmi_clk(hw); + struct rpmi_clk_context *context = rpmi_clk->context; + struct rpmi_mbox_message msg; + struct rpmi_set_config_tx tx; + struct rpmi_set_config_rx rx; + + tx.config = cpu_to_le32(RPMI_CLK_DISABLE); + tx.clkid = cpu_to_le32(rpmi_clk->id); + + rpmi_mbox_init_send_with_response(&msg, RPMI_CLK_SRV_SET_CONFIG, + &tx, sizeof(tx), &rx, sizeof(rx)); + + rpmi_mbox_send_message(context->chan, &msg); +} + +static const struct clk_ops rpmi_clk_ops = { + .recalc_rate = rpmi_clk_recalc_rate, + .determine_rate = rpmi_clk_determine_rate, + .set_rate = rpmi_clk_set_rate, + .prepare = rpmi_clk_enable, + .unprepare = rpmi_clk_disable, +}; + +static struct clk_hw *rpmi_clk_enumerate(struct rpmi_clk_context *context, u32 clkid) +{ + struct device *dev = context->dev; + unsigned long min_rate, max_rate; + union rpmi_clk_rates *rates; + struct rpmi_clk *rpmi_clk; + struct clk_init_data init = {}; + struct clk_hw *clk_hw; + int ret; + + rates = devm_kzalloc(dev, sizeof(*rates), GFP_KERNEL); + if (!rates) + return ERR_PTR(-ENOMEM); + + rpmi_clk = devm_kzalloc(dev, sizeof(*rpmi_clk), GFP_KERNEL); + if (!rpmi_clk) + return ERR_PTR(-ENOMEM); + + rpmi_clk->context = context; + rpmi_clk->rates = rates; + + ret = rpmi_clk_get_attrs(clkid, rpmi_clk); + if (ret) + return dev_err_ptr_probe(dev, ret, + "Failed to get clk-%u attributes\n", + clkid); + + ret = rpmi_clk_get_supported_rates(clkid, rpmi_clk); + if (ret) + return dev_err_ptr_probe(dev, ret, + "Get supported rates failed for 
clk-%u\n", + clkid); + + init.flags = CLK_GET_RATE_NOCACHE; + init.num_parents = 0; + init.ops = &rpmi_clk_ops; + init.name = rpmi_clk->name; + clk_hw = &rpmi_clk->hw; + clk_hw->init = &init; + + ret = devm_clk_hw_register(dev, clk_hw); + if (ret) + return dev_err_ptr_probe(dev, ret, + "Unable to register clk-%u\n", + clkid); + + if (rpmi_clk->type == RPMI_CLK_DISCRETE) { + min_rate = rpmi_clk->rates->discrete[0]; + max_rate = rpmi_clk->rates->discrete[rpmi_clk->num_rates - 1]; + } else { + min_rate = rpmi_clk->rates->linear.min; + max_rate = rpmi_clk->rates->linear.max; + } + + clk_hw_set_rate_range(clk_hw, min_rate, max_rate); + + return clk_hw; +} + +static void rpmi_clk_mbox_chan_release(void *data) +{ + struct mbox_chan *chan = data; + + mbox_free_channel(chan); +} + +static int rpmi_clk_probe(struct platform_device *pdev) +{ + int ret; + unsigned int num_clocks, i; + struct clk_hw_onecell_data *clk_data; + struct rpmi_clk_context *context; + struct rpmi_mbox_message msg; + struct clk_hw *hw_ptr; + struct device *dev = &pdev->dev; + + context = devm_kzalloc(dev, sizeof(*context), GFP_KERNEL); + if (!context) + return -ENOMEM; + context->dev = dev; + platform_set_drvdata(pdev, context); + + context->client.dev = context->dev; + context->client.rx_callback = NULL; + context->client.tx_block = false; + context->client.knows_txdone = true; + context->client.tx_tout = 0; + + context->chan = mbox_request_channel(&context->client, 0); + if (IS_ERR(context->chan)) + return PTR_ERR(context->chan); + + ret = devm_add_action_or_reset(dev, rpmi_clk_mbox_chan_release, context->chan); + if (ret) + return dev_err_probe(dev, ret, "Failed to add rpmi mbox channel cleanup\n"); + + rpmi_mbox_init_get_attribute(&msg, RPMI_MBOX_ATTR_SPEC_VERSION); + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return dev_err_probe(dev, ret, "Failed to get spec version\n"); + if (msg.attr.value < RPMI_MKVER(1, 0)) { + return dev_err_probe(dev, -EINVAL, + "msg protocol version mismatch, expected 0x%x, found 0x%x\n", + RPMI_MKVER(1, 0), msg.attr.value); + } + + rpmi_mbox_init_get_attribute(&msg, RPMI_MBOX_ATTR_SERVICEGROUP_ID); + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return dev_err_probe(dev, ret, "Failed to get service group ID\n"); + if (msg.attr.value != RPMI_SRVGRP_CLOCK) { + return dev_err_probe(dev, -EINVAL, + "service group match failed, expected 0x%x, found 0x%x\n", + RPMI_SRVGRP_CLOCK, msg.attr.value); + } + + rpmi_mbox_init_get_attribute(&msg, RPMI_MBOX_ATTR_SERVICEGROUP_VERSION); + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return dev_err_probe(dev, ret, "Failed to get service group version\n"); + if (msg.attr.value < RPMI_MKVER(1, 0)) { + return dev_err_probe(dev, -EINVAL, + "service group version failed, expected 0x%x, found 0x%x\n", + RPMI_MKVER(1, 0), msg.attr.value); + } + + rpmi_mbox_init_get_attribute(&msg, RPMI_MBOX_ATTR_MAX_MSG_DATA_SIZE); + ret = rpmi_mbox_send_message(context->chan, &msg); + if (ret) + return dev_err_probe(dev, ret, "Failed to get max message data size\n"); + + context->max_msg_data_size = msg.attr.value; + num_clocks = rpmi_clk_get_num_clocks(context); + if (!num_clocks) + return dev_err_probe(dev, -ENODEV, "No clocks found\n"); + + clk_data = devm_kzalloc(dev, struct_size(clk_data, hws, num_clocks), + GFP_KERNEL); + if (!clk_data) + return dev_err_probe(dev, -ENOMEM, "No memory for clock data\n"); + clk_data->num = num_clocks; + + for (i = 0; i < clk_data->num; i++) { + hw_ptr = rpmi_clk_enumerate(context, i); + if 
(IS_ERR(hw_ptr)) { + return dev_err_probe(dev, PTR_ERR(hw_ptr), + "Failed to register clk-%d\n", i); + } + clk_data->hws[i] = hw_ptr; + } + + ret = devm_of_clk_add_hw_provider(dev, of_clk_hw_onecell_get, clk_data); + if (ret) + return dev_err_probe(dev, ret, "Failed to register clock HW provider\n"); + + return 0; +} + +static const struct of_device_id rpmi_clk_of_match[] = { + { .compatible = "riscv,rpmi-clock" }, + { } +}; +MODULE_DEVICE_TABLE(of, rpmi_clk_of_match); + +static struct platform_driver rpmi_clk_driver = { + .driver = { + .name = "riscv-rpmi-clock", + .of_match_table = rpmi_clk_of_match, + }, + .probe = rpmi_clk_probe, +}; +module_platform_driver(rpmi_clk_driver); + +MODULE_AUTHOR("Rahul Pathak "); +MODULE_DESCRIPTION("Clock Driver based on RPMI message protocol"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mailbox/riscv-rpmi-message.h b/include/linux/mailbox/riscv-rpmi-message.h index c3a98fc12c0a..8176d33747fe 100644 --- a/include/linux/mailbox/riscv-rpmi-message.h +++ b/include/linux/mailbox/riscv-rpmi-message.h @@ -90,6 +90,22 @@ static inline int rpmi_to_linux_error(int rpmi_error) } } +/* RPMI service group IDs */ +#define RPMI_SRVGRP_CLOCK 0x00008 + +/* RPMI clock service IDs */ +enum rpmi_clock_service_id { + RPMI_CLK_SRV_ENABLE_NOTIFICATION = 0x01, + RPMI_CLK_SRV_GET_NUM_CLOCKS = 0x02, + RPMI_CLK_SRV_GET_ATTRIBUTES = 0x03, + RPMI_CLK_SRV_GET_SUPPORTED_RATES = 0x04, + RPMI_CLK_SRV_SET_CONFIG = 0x05, + RPMI_CLK_SRV_GET_CONFIG = 0x06, + RPMI_CLK_SRV_SET_RATE = 0x07, + RPMI_CLK_SRV_GET_RATE = 0x08, + RPMI_CLK_SRV_ID_MAX_COUNT +}; + /* RPMI Linux mailbox attribute IDs */ enum rpmi_mbox_attribute_id { RPMI_MBOX_ATTR_SPEC_VERSION, -- cgit v1.2.3 From 495c8d35035edb66e3284113bef01f3b1b843832 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Thu, 25 Sep 2025 13:51:07 -0500 Subject: PM: hibernate: Add pm_hibernation_mode_is_suspend() Some drivers have different flows for hibernation and suspend. If a driver will opportunistically skip thaw(), it needs a hint to know what happens after the hibernation. Introduce a new symbol pm_hibernation_mode_is_suspend() that drivers can call to determine whether the system is being suspended for this purpose. Tested-by: Ionut Nechita Tested-by: Kenneth Crudup Acked-by: Alex Deucher Signed-off-by: Mario Limonciello (AMD) Signed-off-by: Rafael J.
Wysocki --- include/linux/suspend.h | 2 ++ kernel/power/hibernate.c | 11 +++++++++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 317ae31e89b3..0664c685f0b2 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -276,6 +276,7 @@ extern void arch_suspend_enable_irqs(void); extern int pm_suspend(suspend_state_t state); extern bool sync_on_suspend_enabled; +bool pm_hibernation_mode_is_suspend(void); #else /* !CONFIG_SUSPEND */ #define suspend_valid_only_mem NULL @@ -288,6 +289,7 @@ static inline bool pm_suspend_via_firmware(void) { return false; } static inline bool pm_resume_via_firmware(void) { return false; } static inline bool pm_suspend_no_platform(void) { return false; } static inline bool pm_suspend_default_s2idle(void) { return false; } +static inline bool pm_hibernation_mode_is_suspend(void) { return false; } static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e0cdb8d9fd45..5d146218cae8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -80,6 +80,17 @@ static const struct platform_hibernation_ops *hibernation_ops; static atomic_t hibernate_atomic = ATOMIC_INIT(1); +#ifdef CONFIG_SUSPEND +/** + * pm_hibernation_mode_is_suspend - Check if hibernation has been set to suspend + */ +bool pm_hibernation_mode_is_suspend(void) +{ + return hibernation_mode == HIBERNATION_SUSPEND; +} +EXPORT_SYMBOL_GPL(pm_hibernation_mode_is_suspend); +#endif + bool hibernate_acquire(void) { return atomic_add_unless(&hibernate_atomic, -1, 0); -- cgit v1.2.3 From aa43953e862c031ff66e44353c88beb7a449e80d Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 18 Aug 2025 09:39:09 +0530 Subject: irqchip: Add driver for the RPMI system MSI service group The RPMI specification defines a system MSI service group which allows application processors to receive MSIs upon system events such as graceful shutdown/reboot requests, CPU hotplug events, memory hotplug events, etc. Add an irqchip driver for the RISC-V RPMI system MSI service group to directly receive system MSIs in the Linux kernel.
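For illustration (not part of the patch), a consumer of this irqchip sees an ordinary wired platform interrupt once its firmware node routes a system MSI to it. A minimal sketch with a hypothetical "sysev" platform device and handler; only the standard platform_get_irq()/devm_request_irq() API is assumed:

#include <linux/interrupt.h>
#include <linux/platform_device.h>

/* Hypothetical consumer: reacts to a system event, e.g. a graceful
 * shutdown request delivered through the RPMI system MSI domain. */
static irqreturn_t sysev_handler(int irq, void *data)
{
	return IRQ_HANDLED;
}

static int sysev_probe(struct platform_device *pdev)
{
	int irq = platform_get_irq(pdev, 0);	/* resolved via the MSI domain */

	if (irq < 0)
		return irq;
	return devm_request_irq(&pdev->dev, irq, sysev_handler, 0,
				"rpmi-sysev", pdev);
}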
Reviewed-by: Thomas Gleixner Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20250818040920.272664-14-apatel@ventanamicro.com Signed-off-by: Paul Walmsley --- drivers/irqchip/Kconfig | 7 + drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-riscv-rpmi-sysmsi.c | 287 +++++++++++++++++++++++++++++ include/linux/mailbox/riscv-rpmi-message.h | 13 ++ 4 files changed, 308 insertions(+) create mode 100644 drivers/irqchip/irq-riscv-rpmi-sysmsi.c (limited to 'include/linux') diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 6d12c6ab9ea4..e047ba36df16 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -634,6 +634,13 @@ config RISCV_IMSIC select GENERIC_MSI_IRQ select IRQ_MSI_LIB +config RISCV_RPMI_SYSMSI + bool + depends on MAILBOX + select IRQ_DOMAIN_HIERARCHY + select GENERIC_MSI_IRQ + default RISCV + config SIFIVE_PLIC bool depends on RISCV diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 93e3ced023bb..3de083f5484c 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -106,6 +106,7 @@ obj-$(CONFIG_RISCV_INTC) += irq-riscv-intc.o obj-$(CONFIG_RISCV_APLIC) += irq-riscv-aplic-main.o irq-riscv-aplic-direct.o obj-$(CONFIG_RISCV_APLIC_MSI) += irq-riscv-aplic-msi.o obj-$(CONFIG_RISCV_IMSIC) += irq-riscv-imsic-state.o irq-riscv-imsic-early.o irq-riscv-imsic-platform.o +obj-$(CONFIG_RISCV_RPMI_SYSMSI) += irq-riscv-rpmi-sysmsi.o obj-$(CONFIG_SIFIVE_PLIC) += irq-sifive-plic.o obj-$(CONFIG_STARFIVE_JH8100_INTC) += irq-starfive-jh8100-intc.o obj-$(CONFIG_ACLINT_SSWI) += irq-aclint-sswi.o diff --git a/drivers/irqchip/irq-riscv-rpmi-sysmsi.c b/drivers/irqchip/irq-riscv-rpmi-sysmsi.c new file mode 100644 index 000000000000..92e8847dfccc --- /dev/null +++ b/drivers/irqchip/irq-riscv-rpmi-sysmsi.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2025 Ventana Micro Systems Inc. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct rpmi_sysmsi_get_attrs_rx { + __le32 status; + __le32 sys_num_msi; + __le32 flag0; + __le32 flag1; +}; + +#define RPMI_SYSMSI_MSI_ATTRIBUTES_FLAG0_PREF_PRIV BIT(0) + +struct rpmi_sysmsi_set_msi_state_tx { + __le32 sys_msi_index; + __le32 sys_msi_state; +}; + +struct rpmi_sysmsi_set_msi_state_rx { + __le32 status; +}; + +#define RPMI_SYSMSI_MSI_STATE_ENABLE BIT(0) +#define RPMI_SYSMSI_MSI_STATE_PENDING BIT(1) + +struct rpmi_sysmsi_set_msi_target_tx { + __le32 sys_msi_index; + __le32 sys_msi_address_low; + __le32 sys_msi_address_high; + __le32 sys_msi_data; +}; + +struct rpmi_sysmsi_set_msi_target_rx { + __le32 status; +}; + +struct rpmi_sysmsi_priv { + struct device *dev; + struct mbox_client client; + struct mbox_chan *chan; + u32 nr_irqs; + u32 gsi_base; +}; + +static int rpmi_sysmsi_get_num_msi(struct rpmi_sysmsi_priv *priv) +{ + struct rpmi_sysmsi_get_attrs_rx rx; + struct rpmi_mbox_message msg; + int ret; + + rpmi_mbox_init_send_with_response(&msg, RPMI_SYSMSI_SRV_GET_ATTRIBUTES, + NULL, 0, &rx, sizeof(rx)); + ret = rpmi_mbox_send_message(priv->chan, &msg); + if (ret) + return ret; + if (rx.status) + return rpmi_to_linux_error(le32_to_cpu(rx.status)); + + return le32_to_cpu(rx.sys_num_msi); +} + +static int rpmi_sysmsi_set_msi_state(struct rpmi_sysmsi_priv *priv, + u32 sys_msi_index, u32 sys_msi_state) +{ + struct rpmi_sysmsi_set_msi_state_tx tx; + struct rpmi_sysmsi_set_msi_state_rx rx; + struct rpmi_mbox_message msg; + int ret; + + tx.sys_msi_index = cpu_to_le32(sys_msi_index); + tx.sys_msi_state = cpu_to_le32(sys_msi_state); + rpmi_mbox_init_send_with_response(&msg, RPMI_SYSMSI_SRV_SET_MSI_STATE, + &tx, sizeof(tx), &rx, sizeof(rx)); + ret = rpmi_mbox_send_message(priv->chan, &msg); + if (ret) + return ret; + if (rx.status) + return rpmi_to_linux_error(le32_to_cpu(rx.status)); + + return 0; +} + +static int rpmi_sysmsi_set_msi_target(struct rpmi_sysmsi_priv *priv, + u32 sys_msi_index, struct msi_msg *m) +{ + struct rpmi_sysmsi_set_msi_target_tx tx; + struct rpmi_sysmsi_set_msi_target_rx rx; + struct rpmi_mbox_message msg; + int ret; + + tx.sys_msi_index = cpu_to_le32(sys_msi_index); + tx.sys_msi_address_low = cpu_to_le32(m->address_lo); + tx.sys_msi_address_high = cpu_to_le32(m->address_hi); + tx.sys_msi_data = cpu_to_le32(m->data); + rpmi_mbox_init_send_with_response(&msg, RPMI_SYSMSI_SRV_SET_MSI_TARGET, + &tx, sizeof(tx), &rx, sizeof(rx)); + ret = rpmi_mbox_send_message(priv->chan, &msg); + if (ret) + return ret; + if (rx.status) + return rpmi_to_linux_error(le32_to_cpu(rx.status)); + + return 0; +} + +static void rpmi_sysmsi_irq_mask(struct irq_data *d) +{ + struct rpmi_sysmsi_priv *priv = irq_data_get_irq_chip_data(d); + irq_hw_number_t hwirq = irqd_to_hwirq(d); + int ret; + + ret = rpmi_sysmsi_set_msi_state(priv, hwirq, 0); + if (ret) + dev_warn(priv->dev, "Failed to mask hwirq %lu (error %d)\n", hwirq, ret); + irq_chip_mask_parent(d); +} + +static void rpmi_sysmsi_irq_unmask(struct irq_data *d) +{ + struct rpmi_sysmsi_priv *priv = irq_data_get_irq_chip_data(d); + irq_hw_number_t hwirq = irqd_to_hwirq(d); + int ret; + + irq_chip_unmask_parent(d); + ret = rpmi_sysmsi_set_msi_state(priv, hwirq, RPMI_SYSMSI_MSI_STATE_ENABLE); + if (ret) + dev_warn(priv->dev, "Failed to unmask hwirq %lu (error %d)\n", hwirq, ret); +} + +static void rpmi_sysmsi_write_msg(struct irq_data *d, struct msi_msg *msg) +{ + struct rpmi_sysmsi_priv 
*priv = irq_data_get_irq_chip_data(d); + irq_hw_number_t hwirq = irqd_to_hwirq(d); + int ret; + + /* For zeroed MSI, do nothing as of now */ + if (!msg->address_hi && !msg->address_lo && !msg->data) + return; + + ret = rpmi_sysmsi_set_msi_target(priv, hwirq, msg); + if (ret) + dev_warn(priv->dev, "Failed to set target for hwirq %lu (error %d)\n", hwirq, ret); +} + +static void rpmi_sysmsi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) +{ + arg->desc = desc; + arg->hwirq = desc->data.icookie.value; +} + +static int rpmi_sysmsi_translate(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *hwirq, unsigned int *type) +{ + struct msi_domain_info *info = d->host_data; + struct rpmi_sysmsi_priv *priv = info->data; + + if (WARN_ON(fwspec->param_count < 1)) + return -EINVAL; + + /* For DT, gsi_base is always zero. */ + *hwirq = fwspec->param[0] - priv->gsi_base; + *type = IRQ_TYPE_NONE; + return 0; +} + +static const struct msi_domain_template rpmi_sysmsi_template = { + .chip = { + .name = "RPMI-SYSMSI", + .irq_mask = rpmi_sysmsi_irq_mask, + .irq_unmask = rpmi_sysmsi_irq_unmask, +#ifdef CONFIG_SMP + .irq_set_affinity = irq_chip_set_affinity_parent, +#endif + .irq_write_msi_msg = rpmi_sysmsi_write_msg, + .flags = IRQCHIP_SET_TYPE_MASKED | + IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_MASK_ON_SUSPEND, + }, + + .ops = { + .set_desc = rpmi_sysmsi_set_desc, + .msi_translate = rpmi_sysmsi_translate, + }, + + .info = { + .bus_token = DOMAIN_BUS_WIRED_TO_MSI, + .flags = MSI_FLAG_USE_DEV_FWNODE, + .handler = handle_simple_irq, + .handler_name = "simple", + }, +}; + +static int rpmi_sysmsi_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct rpmi_sysmsi_priv *priv; + int rc; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + priv->dev = dev; + + /* Setup mailbox client */ + priv->client.dev = priv->dev; + priv->client.rx_callback = NULL; + priv->client.tx_block = false; + priv->client.knows_txdone = true; + priv->client.tx_tout = 0; + + /* Request mailbox channel */ + priv->chan = mbox_request_channel(&priv->client, 0); + if (IS_ERR(priv->chan)) + return PTR_ERR(priv->chan); + + /* Get number of system MSIs */ + rc = rpmi_sysmsi_get_num_msi(priv); + if (rc < 1) { + mbox_free_channel(priv->chan); + if (rc) + return dev_err_probe(dev, rc, "Failed to get number of system MSIs\n"); + else + return dev_err_probe(dev, -ENODEV, "No system MSIs found\n"); + } + priv->nr_irqs = rc; + + /* + * The device MSI domain for platform devices on RISC-V architecture + * is only available after the MSI controller driver is probed so, + * explicitly configure here. + */ + if (!dev_get_msi_domain(dev)) { + /* + * The device MSI domain for OF devices is only set at the + * time of populating/creating OF device. If the device MSI + * domain is discovered later after the OF device is created + * then we need to set it explicitly before using any platform + * MSI functions. 
+ */ + if (dev_of_node(dev)) + of_msi_configure(dev, dev_of_node(dev)); + + if (!dev_get_msi_domain(dev)) { + mbox_free_channel(priv->chan); + return -EPROBE_DEFER; + } + } + + if (!msi_create_device_irq_domain(dev, MSI_DEFAULT_DOMAIN, + &rpmi_sysmsi_template, + priv->nr_irqs, priv, priv)) { + mbox_free_channel(priv->chan); + return dev_err_probe(dev, -ENOMEM, "failed to create MSI irq domain\n"); + } + + dev_info(dev, "%u system MSIs registered\n", priv->nr_irqs); + return 0; +} + +static const struct of_device_id rpmi_sysmsi_match[] = { + { .compatible = "riscv,rpmi-system-msi" }, + {} +}; + +static struct platform_driver rpmi_sysmsi_driver = { + .driver = { + .name = "rpmi-sysmsi", + .of_match_table = rpmi_sysmsi_match, + }, + .probe = rpmi_sysmsi_probe, +}; +builtin_platform_driver(rpmi_sysmsi_driver); diff --git a/include/linux/mailbox/riscv-rpmi-message.h b/include/linux/mailbox/riscv-rpmi-message.h index 8176d33747fe..e135c6564d0c 100644 --- a/include/linux/mailbox/riscv-rpmi-message.h +++ b/include/linux/mailbox/riscv-rpmi-message.h @@ -91,6 +91,7 @@ static inline int rpmi_to_linux_error(int rpmi_error) } /* RPMI service group IDs */ +#define RPMI_SRVGRP_SYSTEM_MSI 0x00002 #define RPMI_SRVGRP_CLOCK 0x00008 /* RPMI clock service IDs */ @@ -106,6 +107,18 @@ enum rpmi_clock_service_id { RPMI_CLK_SRV_ID_MAX_COUNT }; +/* RPMI system MSI service IDs */ +enum rpmi_sysmsi_service_id { + RPMI_SYSMSI_SRV_ENABLE_NOTIFICATION = 0x01, + RPMI_SYSMSI_SRV_GET_ATTRIBUTES = 0x02, + RPMI_SYSMSI_SRV_GET_MSI_ATTRIBUTES = 0x03, + RPMI_SYSMSI_SRV_SET_MSI_STATE = 0x04, + RPMI_SYSMSI_SRV_GET_MSI_STATE = 0x05, + RPMI_SYSMSI_SRV_SET_MSI_TARGET = 0x06, + RPMI_SYSMSI_SRV_GET_MSI_TARGET = 0x07, + RPMI_SYSMSI_SRV_ID_MAX_COUNT +}; + /* RPMI Linux mailbox attribute IDs */ enum rpmi_mbox_attribute_id { RPMI_MBOX_ATTR_SPEC_VERSION, -- cgit v1.2.3 From 1a2b423be6a89dd07d5fc27ea042be68697a6a49 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 14 Sep 2025 22:24:13 +0200 Subject: i2c: boardinfo: Annotate code used in init phase only Annotate two places in boardinfo code: - __i2c_first_dynamic_bus_num is set in the init phase. Annotate it as __ro_after_init to prevent later changes. - i2c_register_board_info() is used in the init phase only, so annotate it as __init, allowing the memory to be freed after the init phase. This is safe, see comment: "done in board-specific init code near arch_initcall() time" Signed-off-by: Heiner Kallweit Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-boardinfo.c | 4 ++-- include/linux/i2c.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-boardinfo.c b/drivers/i2c/i2c-boardinfo.c index 4df8ad092df3..338800321f8b 100644 --- a/drivers/i2c/i2c-boardinfo.c +++ b/drivers/i2c/i2c-boardinfo.c @@ -22,7 +22,7 @@ EXPORT_SYMBOL_GPL(__i2c_board_lock); LIST_HEAD(__i2c_board_list); EXPORT_SYMBOL_GPL(__i2c_board_list); -int __i2c_first_dynamic_bus_num; +int __i2c_first_dynamic_bus_num __ro_after_init; EXPORT_SYMBOL_GPL(__i2c_first_dynamic_bus_num); @@ -48,7 +48,7 @@ EXPORT_SYMBOL_GPL(__i2c_first_dynamic_bus_num); * The board info passed can safely be __initdata, but be careful of embedded * pointers (for platform_data, functions, etc) since that won't be copied.
*/ -int i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned len) +int __init i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned len) { int status; diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 20fd41b51d5c..11a19241e360 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -499,7 +499,7 @@ static inline struct i2c_client *i2c_verify_client(struct device *dev) * Modules for add-on boards must use other calls. */ #ifdef CONFIG_I2C_BOARDINFO -int +int __init i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned n); #else -- cgit v1.2.3 From 14967a9c7d247841b0312c48dcf8cd29e55a4cc8 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Mon, 15 Sep 2025 18:45:20 -0600 Subject: mm/hugetlb: fix copy_hugetlb_page_range() to use ->pt_share_count commit 59d9094df3d79 ("mm: hugetlb: independent PMD page table shared count") introduced ->pt_share_count dedicated to hugetlb PMD share count tracking, but omitted fixing copy_hugetlb_page_range(), leaving the function relying on page_count() for tracking, which no longer works. When lazy page table copying for hugetlb is disabled, that is, when commit bcd51a3c679d ("hugetlb: lazy page table copies in fork()") is reverted, fork()'ing with hugetlb PMD sharing quickly locks up - [ 239.446559] watchdog: BUG: soft lockup - CPU#75 stuck for 27s! [ 239.446611] RIP: 0010:native_queued_spin_lock_slowpath+0x7e/0x2e0 [ 239.446631] Call Trace: [ 239.446633] [ 239.446636] _raw_spin_lock+0x3f/0x60 [ 239.446639] copy_hugetlb_page_range+0x258/0xb50 [ 239.446645] copy_page_range+0x22b/0x2c0 [ 239.446651] dup_mmap+0x3e2/0x770 [ 239.446654] dup_mm.constprop.0+0x5e/0x230 [ 239.446657] copy_process+0xd17/0x1760 [ 239.446660] kernel_clone+0xc0/0x3e0 [ 239.446661] __do_sys_clone+0x65/0xa0 [ 239.446664] do_syscall_64+0x82/0x930 [ 239.446668] ? count_memcg_events+0xd2/0x190 [ 239.446671] ? syscall_trace_enter+0x14e/0x1f0 [ 239.446676] ? syscall_exit_work+0x118/0x150 [ 239.446677] ? arch_exit_to_user_mode_prepare.constprop.0+0x9/0xb0 [ 239.446681] ? clear_bhb_loop+0x30/0x80 [ 239.446684] ? clear_bhb_loop+0x30/0x80 [ 239.446686] entry_SYSCALL_64_after_hwframe+0x76/0x7e There are two options to resolve the potential latent issue: 1. warn against PMD sharing in copy_hugetlb_page_range(), 2. fix it. This patch opts for the second option. While at it, simplify the comment; the details are no longer relevant.
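For illustration (not part of the patch), the fix boils down to answering "is this PMD page table shared?" with the ptdesc's dedicated share count instead of page_count(). A condensed sketch with a hypothetical helper name; ptdesc_pmd_is_shared() and virt_to_ptdesc() are the helpers used in the diff below:

#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
static bool dst_pmd_table_is_shared(pte_t *dst_pte)
{
	/* shared iff the dedicated pt_share_count is non-zero */
	return ptdesc_pmd_is_shared(virt_to_ptdesc(dst_pte));
}
#else
static bool dst_pmd_table_is_shared(pte_t *dst_pte)
{
	return false;	/* PMD sharing compiled out */
}
#endif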
Link: https://lkml.kernel.org/r/20250916004520.1604530-1-jane.chu@oracle.com Fixes: 59d9094df3d7 ("mm: hugetlb: independent PMD page table shared count") Signed-off-by: Jane Chu Reviewed-by: Harry Yoo Acked-by: Oscar Salvador Acked-by: David Hildenbrand Cc: Jann Horn Cc: Liu Shixin Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 5 +++++ mm/hugetlb.c | 15 +++++---------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 08bc2442db93..a643fae8a349 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -631,6 +631,11 @@ static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc) { return atomic_read(&ptdesc->pt_share_count); } + +static inline bool ptdesc_pmd_is_shared(struct ptdesc *ptdesc) +{ + return !!ptdesc_pmd_pts_count(ptdesc); +} #else static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eed59cfb5d21..6cfe0b43ab8f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5594,18 +5594,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, break; } - /* - * If the pagetables are shared don't copy or take references. - * - * dst_pte == src_pte is the common case of src/dest sharing. - * However, src could have 'unshared' and dst shares with - * another vma. So page_count of ptep page is checked instead - * to reliably determine whether pte is shared. - */ - if (page_count(virt_to_page(dst_pte)) > 1) { +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING + /* If the pagetables are shared, there is nothing to do */ + if (ptdesc_pmd_is_shared(virt_to_ptdesc(dst_pte))) { addr |= last_addr_mask; continue; } +#endif dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); @@ -7602,7 +7597,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, hugetlb_vma_assert_locked(vma); if (sz != PMD_SIZE) return 0; - if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep))) + if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep))) return 0; pud_clear(pud); -- cgit v1.2.3 From 7e89979f6695fb56e8739b7d19614256e637131d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 13 Sep 2025 17:03:39 -0700 Subject: include/linux/pgtable.h: convert arch_enter_lazy_mmu_mode() and friends to static inlines commit c519c3c0a113 ("mm/kasan: avoid lazy MMU mode hazards") introduced the use of arch_enter_lazy_mmu_mode(), which results in the compiler complaining about "statement has no effect", when __HAVE_ARCH_LAZY_MMU_MODE is not defined in include/linux/pgtable.h The exact warning/error is: In file included from ./include/linux/kasan.h:37, from mm/kasan/shadow.c:14: mm/kasan/shadow.c: In function kasan_populate_vmalloc_pte: ./include/linux/pgtable.h:247:41: error: statement with no effect [-Werror=unused-value] 247 | #define arch_enter_lazy_mmu_mode() (LAZY_MMU_DEFAULT) | ^ mm/kasan/shadow.c:322:9: note: in expansion of macro arch_enter_lazy_mmu_mode> 322 | arch_enter_lazy_mmu_mode(); | ^~~~~~~~~~~~~~~~~~~~~~~~ switching these "functions" to static inlines fixes this up. 
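For illustration (not part of the patch), the warning mechanics: the macro form expands a statement-position call into a bare value, which -Wunused-value flags, while an empty static inline produces a void call that compiles silently. A minimal standalone sketch, with hypothetical names and a placeholder value for LAZY_MMU_DEFAULT:

#define LAZY_MMU_DEFAULT	0	/* placeholder for the sketch */

#define enter_lazy_mmu_as_macro()	(LAZY_MMU_DEFAULT)
static inline void enter_lazy_mmu_as_inline(void) {}

void example(void)
{
	enter_lazy_mmu_as_macro();	/* expands to "(0);" -> warning:
					 * statement with no effect */
	enter_lazy_mmu_as_inline();	/* plain void call, no warning */
}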
Fixes: c519c3c0a113 ("mm/kasan: avoid lazy MMU mode hazards") Reported-by: Balbir Singh Closes: https://lkml.kernel.org/r/20250912235515.367061-1-balbirs@nvidia.com Cc: Alexander Gordeev Cc: Andrey Ryabinin Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2b80fd456c8b..25a7257052ff 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -232,9 +232,9 @@ static inline int pmd_dirty(pmd_t pmd) * and the mode cannot be used in interrupt context. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE -#define arch_enter_lazy_mmu_mode() do {} while (0) -#define arch_leave_lazy_mmu_mode() do {} while (0) -#define arch_flush_lazy_mmu_mode() do {} while (0) +static inline void arch_enter_lazy_mmu_mode(void) {} +static inline void arch_leave_lazy_mmu_mode(void) {} +static inline void arch_flush_lazy_mmu_mode(void) {} #endif #ifndef pte_batch_hint -- cgit v1.2.3 From 17f0d1f6321caa95699b8f96baf12e654d7b8d60 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Fri, 26 Sep 2025 01:50:28 +0800 Subject: bpf: Add lookup_and_delete_elem for BPF_MAP_STACK_TRACE The stacktrace map can easily become full, which leads to failures when obtaining the stack. In addition to increasing the size of the map, another solution is to delete the stack_id after it has been looked up from userspace, so extend the existing bpf_map_lookup_and_delete_elem() functionality to stacktrace map types. Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250925175030.1615837-1-chen.dylane@linux.dev --- include/linux/bpf.h | 2 +- kernel/bpf/stackmap.c | 16 ++++++++++++++-- kernel/bpf/syscall.c | 8 +++++--- 3 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ea2ed6771cc6..6338e54a9b1f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2724,7 +2724,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, u64 flags); -int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); +int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, bool delete); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 3615c06b7dfa..2e182a3ac4ce 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -646,7 +646,15 @@ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) } /* Called from syscall */ -int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +static int stack_map_lookup_and_delete_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return bpf_stackmap_extract(map, key, value, true); +} + +/* Called from syscall */ +int bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, + bool delete) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *old_bucket; @@ -663,7 +671,10 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) memcpy(value, bucket->data, trace_len); memset(value + trace_len, 0, map->value_size - trace_len); - old_bucket = xchg(&smap->buckets[id], bucket); + if (delete) + old_bucket = bucket; + else + old_bucket = xchg(&smap->buckets[id], bucket); if (old_bucket)
pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); return 0; @@ -754,6 +765,7 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, .map_lookup_elem = stack_map_lookup_elem, + .map_lookup_and_delete_elem = stack_map_lookup_and_delete_elem, .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, .map_check_btf = map_check_no_btf, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index adb05d235011..a48fa86f82a7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -320,7 +320,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { - err = bpf_stackmap_copy(map, key, value); + err = bpf_stackmap_extract(map, key, value, false); } else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) { err = bpf_fd_array_map_lookup_elem(map, key, value); } else if (IS_FD_HASH(map)) { @@ -1666,7 +1666,8 @@ struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) } EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); -int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +int __weak bpf_stackmap_extract(struct bpf_map *map, void *key, void *value, + bool delete) { return -ENOTSUPP; } @@ -2197,7 +2198,8 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) } else if (map->map_type == BPF_MAP_TYPE_HASH || map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_STACK_TRACE) { if (!bpf_map_is_offloaded(map)) { bpf_disable_instrumentation(); rcu_read_lock(); -- cgit v1.2.3 From 212b0f07cf021575ec25e0b2336df77c7a4d2e68 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 3 Sep 2025 14:59:43 +0200 Subject: locking/local_lock: Expose dep_map in local_trylock_t. lockdep_is_held() macro assumes that "struct lockdep_map dep_map;" is a top level field of any lock that participates in LOCKDEP. Make it so for local_trylock_t. 
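For context, a simplified sketch of the assumption (macro body abridged from include/linux/lockdep.h):

	/* lockdep_is_held() dereferences ->dep_map directly on the lock it is given: */
	#define lockdep_is_held(lock)	lock_is_held(&(lock)->dep_map)

	local_trylock_t tl;

	lockdep_is_held(&tl);	/* with the old nested layout only tl.llock.dep_map
				 * existed, so this expansion did not build; with
				 * dep_map as a top-level member it resolves directly */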
Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Alexei Starovoitov Reviewed-by: Harry Yoo Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- include/linux/local_lock_internal.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index d80b5306a2c0..949de37700db 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -17,7 +17,10 @@ typedef struct { /* local_trylock() and local_trylock_irqsave() only work with local_trylock_t */ typedef struct { - local_lock_t llock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; + struct task_struct *owner; +#endif u8 acquired; } local_trylock_t; @@ -31,7 +34,7 @@ typedef struct { .owner = NULL, # define LOCAL_TRYLOCK_DEBUG_INIT(lockname) \ - .llock = { LOCAL_LOCK_DEBUG_INIT((lockname).llock) }, + LOCAL_LOCK_DEBUG_INIT(lockname) static inline void local_lock_acquire(local_lock_t *l) { @@ -81,7 +84,7 @@ do { \ local_lock_debug_init(lock); \ } while (0) -#define __local_trylock_init(lock) __local_lock_init(lock.llock) +#define __local_trylock_init(lock) __local_lock_init((local_lock_t *)lock) #define __spinlock_nested_bh_init(lock) \ do { \ -- cgit v1.2.3 From 2d517aa09bbc4203f10cdee7e1d42f3bbdc1b1cd Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Sep 2025 14:59:45 +0200 Subject: slab: add opt-in caching layer of percpu sheaves Specifying a non-zero value for a new struct kmem_cache_args field sheaf_capacity will set up a caching layer of percpu arrays called sheaves of given capacity for the created cache. Allocations from the cache will allocate via the percpu sheaves (main or spare) as long as they have no NUMA node preference. Frees will also put the object back into one of the sheaves. When both percpu sheaves are found empty during an allocation, an empty sheaf may be replaced with a full one from the per-node barn. If none are available and the allocation is allowed to block, an empty sheaf is refilled from slab(s) by an internal bulk alloc operation. When both percpu sheaves are full during freeing, the barn can replace a full one with an empty one, unless it is over the limit on full sheaves. In that case a sheaf is flushed to slab(s) by an internal bulk free operation. Flushing sheaves and barns is also wired to the existing cpu flushing and cache shrinking operations. The sheaves do not distinguish NUMA locality of the cached objects. If an allocation is requested with kmem_cache_alloc_node() (or a mempolicy with strict_numa mode enabled) with a specific node (not NUMA_NO_NODE), the sheaves are bypassed. The bulk operations exposed to slab users also try to utilize the sheaves as long as the necessary (full or empty) sheaves are available on the cpu or in the barn. Once depleted, they will fall back to bulk alloc/free to slabs directly to avoid double copying. The sheaf_capacity value is exported in sysfs for observability. Sysfs CONFIG_SLUB_STATS counters alloc_cpu_sheaf and free_cpu_sheaf count objects allocated or freed using the sheaves (and thus not counting towards the other alloc/free path counters). Counters sheaf_refill and sheaf_flush count objects filled or flushed from or to slab pages, and can be used to assess how effective the caching is. The refill and flush operations will also count towards the usual alloc_fastpath/slowpath, free_fastpath/slowpath and other counters for the backing slabs.
For barn operations, barn_get and barn_put count how many full sheaves were taken from or put to the barn; the _fail variants count how many such requests could not be satisfied, mainly because the barn was either empty or full. While the barn also holds empty sheaves to make some operations easier, these are not critical enough to mandate counters of their own. Finally, there are sheaf_alloc/sheaf_free counters. Access to the percpu sheaves is protected by local_trylock() when potential callers include irq context, and local_lock() otherwise (such as when we already know the gfp flags allow blocking). The trylock failures should be rare and we can easily fall back. Each per-NUMA-node barn has a spin_lock. When slub_debug is enabled for a cache with sheaf_capacity also specified, the latter is ignored so that allocations and frees reach the slow path where debugging hooks are processed. Similarly, we ignore it with CONFIG_SLUB_TINY which prefers low memory usage to performance. [boot failure: https://lore.kernel.org/all/583eacf5-c971-451a-9f76-fed0e341b815@linux.ibm.com/ ] Reported-and-tested-by: Venkat Rao Bagalkote Reviewed-by: Harry Yoo Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 31 ++ mm/slab.h | 2 + mm/slab_common.c | 5 +- mm/slub.c | 1164 +++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 1142 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index d5a8ab98035c..49acbcdc6696 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -335,6 +335,37 @@ struct kmem_cache_args { * %NULL means no constructor. */ void (*ctor)(void *); + /** + * @sheaf_capacity: Enable sheaves of given capacity for the cache. + * + * With a non-zero value, allocations from the cache go through caching + * arrays called sheaves. Each cpu has a main sheaf that's always + * present, and a spare sheaf that may be not present. When both become + * empty, there's an attempt to replace an empty sheaf with a full sheaf + * from the per-node barn. + * + * When no full sheaf is available, and gfp flags allow blocking, a + * sheaf is allocated and filled from slab(s) using bulk allocation. + * Otherwise the allocation falls back to the normal operation + * allocating a single object from a slab. + * + * Analogically when freeing and both percpu sheaves are full, the barn + * may replace it with an empty sheaf, unless it's over capacity. In + * that case a sheaf is bulk freed to slab pages. + * + * The sheaves do not enforce NUMA placement of objects, so allocations + * via kmem_cache_alloc_node() with a node specified other than + * NUMA_NO_NODE will bypass them. + * + * Bulk allocation and free operations also try to use the cpu sheaves + * and barn, but fallback to using slab pages directly. + * + * When slub_debug is enabled for the cache, the sheaf_capacity argument + * is ignored. + * + * %0 means no sheaves will be created. + */ + unsigned int sheaf_capacity; }; struct kmem_cache *__kmem_cache_create_args(const char *name, diff --git a/mm/slab.h b/mm/slab.h index 248b34c839b7..206987ce44a4 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -235,6 +235,7 @@ struct kmem_cache { #ifndef CONFIG_SLUB_TINY struct kmem_cache_cpu __percpu *cpu_slab; #endif + struct slub_percpu_sheaves __percpu *cpu_sheaves; /* Used for retrieving partial slabs, etc.
*/ slab_flags_t flags; unsigned long min_partial; @@ -248,6 +249,7 @@ struct kmem_cache { /* Number of per cpu partial slabs to keep around */ unsigned int cpu_partial_slabs; #endif + unsigned int sheaf_capacity; struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ diff --git a/mm/slab_common.c b/mm/slab_common.c index bfe7c40eeee1..e2b197e47866 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -163,6 +163,9 @@ int slab_unmergeable(struct kmem_cache *s) return 1; #endif + if (s->cpu_sheaves) + return 1; + /* * We may have set a slab to be unmergeable during bootstrap. */ @@ -321,7 +324,7 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, object_size - args->usersize < args->useroffset)) args->usersize = args->useroffset = 0; - if (!args->usersize) + if (!args->usersize && !args->sheaf_capacity) s = __kmem_cache_alias(name, object_size, args->align, flags, args->ctor); if (s) diff --git a/mm/slub.c b/mm/slub.c index 9f671ec76131..cba188b7e04d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -363,8 +363,10 @@ static inline void debugfs_slab_add(struct kmem_cache *s) { } #endif enum stat_item { + ALLOC_PCS, /* Allocation from percpu sheaf */ ALLOC_FASTPATH, /* Allocation from cpu slab */ ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ + FREE_PCS, /* Free to percpu sheaf */ FREE_FASTPATH, /* Free to cpu slab */ FREE_SLOWPATH, /* Freeing not to cpu slab */ FREE_FROZEN, /* Freeing to frozen slab */ @@ -389,6 +391,14 @@ enum stat_item { CPU_PARTIAL_FREE, /* Refill cpu partial on free */ CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ + SHEAF_FLUSH, /* Objects flushed from a sheaf */ + SHEAF_REFILL, /* Objects refilled to a sheaf */ + SHEAF_ALLOC, /* Allocation of an empty sheaf */ + SHEAF_FREE, /* Freeing of an empty sheaf */ + BARN_GET, /* Got full sheaf from barn */ + BARN_GET_FAIL, /* Failed to get full sheaf from barn */ + BARN_PUT, /* Put full sheaf to barn */ + BARN_PUT_FAIL, /* Failed to put full sheaf to barn */ NR_SLUB_STAT_ITEMS }; @@ -435,6 +445,32 @@ void stat_add(const struct kmem_cache *s, enum stat_item si, int v) #endif } +#define MAX_FULL_SHEAVES 10 +#define MAX_EMPTY_SHEAVES 10 + +struct node_barn { + spinlock_t lock; + struct list_head sheaves_full; + struct list_head sheaves_empty; + unsigned int nr_full; + unsigned int nr_empty; +}; + +struct slab_sheaf { + union { + struct rcu_head rcu_head; + struct list_head barn_list; + }; + unsigned int size; + void *objects[]; +}; + +struct slub_percpu_sheaves { + local_trylock_t lock; + struct slab_sheaf *main; /* never NULL when unlocked */ + struct slab_sheaf *spare; /* empty or full, may be NULL */ +}; + /* * The slab lists for all objects. */ @@ -447,6 +483,7 @@ struct kmem_cache_node { atomic_long_t total_objects; struct list_head full; #endif + struct node_barn *barn; }; static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) @@ -454,6 +491,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) return s->node[node]; } +/* Get the barn of the current cpu's memory node */ +static inline struct node_barn *get_barn(struct kmem_cache *s) +{ + return get_node(s, numa_mem_id())->barn; +} + /* * Iterator over all nodes. 
The body will be executed for each node that has * a kmem_cache_node structure allocated (which is true for all online nodes) @@ -470,12 +513,19 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) */ static nodemask_t slab_nodes; -#ifndef CONFIG_SLUB_TINY /* * Workqueue used for flush_cpu_slab(). */ static struct workqueue_struct *flushwq; -#endif + +struct slub_flush_work { + struct work_struct work; + struct kmem_cache *s; + bool skip; +}; + +static DEFINE_MUTEX(flush_lock); +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); /******************************************************************** * Core slab cache functions @@ -2473,6 +2523,360 @@ static void *setup_object(struct kmem_cache *s, void *object) return object; } +static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) +{ + struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects, + s->sheaf_capacity), gfp); + + if (unlikely(!sheaf)) + return NULL; + + stat(s, SHEAF_ALLOC); + + return sheaf; +} + +static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) +{ + kfree(sheaf); + + stat(s, SHEAF_FREE); +} + +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p); + + +static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, + gfp_t gfp) +{ + int to_fill = s->sheaf_capacity - sheaf->size; + int filled; + + if (!to_fill) + return 0; + + filled = __kmem_cache_alloc_bulk(s, gfp, to_fill, + &sheaf->objects[sheaf->size]); + + sheaf->size += filled; + + stat_add(s, SHEAF_REFILL, filled); + + if (filled < to_fill) + return -ENOMEM; + + return 0; +} + + +static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp) +{ + struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp); + + if (!sheaf) + return NULL; + + if (refill_sheaf(s, sheaf, gfp)) { + free_empty_sheaf(s, sheaf); + return NULL; + } + + return sheaf; +} + +/* + * Maximum number of objects freed during a single flush of main pcs sheaf. + * Translates directly to an on-stack array size. + */ +#define PCS_BATCH_MAX 32U + +static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); + +/* + * Free all objects from the main sheaf. In order to perform + * __kmem_cache_free_bulk() outside of cpu_sheaves->lock, work in batches where + * object pointers are moved to a on-stack array under the lock. To bound the + * stack usage, limit each batch to PCS_BATCH_MAX. + * + * returns true if at least partially flushed + */ +static bool sheaf_flush_main(struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + unsigned int batch, remaining; + void *objects[PCS_BATCH_MAX]; + struct slab_sheaf *sheaf; + bool ret = false; + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + return ret; + + pcs = this_cpu_ptr(s->cpu_sheaves); + sheaf = pcs->main; + + batch = min(PCS_BATCH_MAX, sheaf->size); + + sheaf->size -= batch; + memcpy(objects, sheaf->objects + sheaf->size, batch * sizeof(void *)); + + remaining = sheaf->size; + + local_unlock(&s->cpu_sheaves->lock); + + __kmem_cache_free_bulk(s, batch, &objects[0]); + + stat_add(s, SHEAF_FLUSH, batch); + + ret = true; + + if (remaining) + goto next_batch; + + return ret; +} + +/* + * Free all objects from a sheaf that's unused, i.e. not linked to any + * cpu_sheaves, so we need no locking and batching. The locking is also not + * necessary when flushing cpu's sheaves (both spare and main) during cpu + * hotremove as the cpu is not executing anymore. 
+ */ +static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf) +{ + if (!sheaf->size) + return; + + stat_add(s, SHEAF_FLUSH, sheaf->size); + + __kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]); + + sheaf->size = 0; +} + +/* + * Caller needs to make sure migration is disabled in order to fully flush + * single cpu's sheaves + * + * must not be called from an irq + * + * flushing operations are rare so let's keep it simple and flush to slabs + * directly, skipping the barn + */ +static void pcs_flush_all(struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *spare; + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + spare = pcs->spare; + pcs->spare = NULL; + + local_unlock(&s->cpu_sheaves->lock); + + if (spare) { + sheaf_flush_unused(s, spare); + free_empty_sheaf(s, spare); + } + + sheaf_flush_main(s); +} + +static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu) +{ + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + /* The cpu is not executing anymore so we don't need pcs->lock */ + sheaf_flush_unused(s, pcs->main); + if (pcs->spare) { + sheaf_flush_unused(s, pcs->spare); + free_empty_sheaf(s, pcs->spare); + pcs->spare = NULL; + } +} + +static void pcs_destroy(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + /* can happen when unwinding failed create */ + if (!pcs->main) + continue; + + /* + * We have already passed __kmem_cache_shutdown() so everything + * was flushed and there should be no objects allocated from + * slabs, otherwise kmem_cache_destroy() would have aborted. + * Therefore something would have to be really wrong if the + * warnings here trigger, and we should rather leave objects and + * sheaves to leak in that case. + */ + + WARN_ON(pcs->spare); + + if (!WARN_ON(pcs->main->size)) { + free_empty_sheaf(s, pcs->main); + pcs->main = NULL; + } + } + + free_percpu(s->cpu_sheaves); + s->cpu_sheaves = NULL; +} + +static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) +{ + struct slab_sheaf *empty = NULL; + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + if (barn->nr_empty) { + empty = list_first_entry(&barn->sheaves_empty, + struct slab_sheaf, barn_list); + list_del(&empty->barn_list); + barn->nr_empty--; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return empty; +} + +/* + * The following two functions are used mainly in cases where we have to undo an + * intended action due to a race or cpu migration. Thus they do not check the + * empty or full sheaf limits for simplicity. + */ + +static void barn_put_empty_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) +{ + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_add(&sheaf->barn_list, &barn->sheaves_empty); + barn->nr_empty++; + + spin_unlock_irqrestore(&barn->lock, flags); +} + +static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) +{ + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_add(&sheaf->barn_list, &barn->sheaves_full); + barn->nr_full++; + + spin_unlock_irqrestore(&barn->lock, flags); +} + +/* + * If a full sheaf is available, return it and put the supplied empty one to + * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't + * change. 
+ */ +static struct slab_sheaf * +barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) +{ + struct slab_sheaf *full = NULL; + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + if (barn->nr_full) { + full = list_first_entry(&barn->sheaves_full, struct slab_sheaf, + barn_list); + list_del(&full->barn_list); + list_add(&empty->barn_list, &barn->sheaves_empty); + barn->nr_full--; + barn->nr_empty++; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return full; +} + +/* + * If an empty sheaf is available, return it and put the supplied full one to + * barn. But if there are too many full sheaves, reject this with -E2BIG. + */ +static struct slab_sheaf * +barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) +{ + struct slab_sheaf *empty; + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + if (barn->nr_full >= MAX_FULL_SHEAVES) { + empty = ERR_PTR(-E2BIG); + } else if (!barn->nr_empty) { + empty = ERR_PTR(-ENOMEM); + } else { + empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, + barn_list); + list_del(&empty->barn_list); + list_add(&full->barn_list, &barn->sheaves_full); + barn->nr_empty--; + barn->nr_full++; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return empty; +} + +static void barn_init(struct node_barn *barn) +{ + spin_lock_init(&barn->lock); + INIT_LIST_HEAD(&barn->sheaves_full); + INIT_LIST_HEAD(&barn->sheaves_empty); + barn->nr_full = 0; + barn->nr_empty = 0; +} + +static void barn_shrink(struct kmem_cache *s, struct node_barn *barn) +{ + struct list_head empty_list; + struct list_head full_list; + struct slab_sheaf *sheaf, *sheaf2; + unsigned long flags; + + INIT_LIST_HEAD(&empty_list); + INIT_LIST_HEAD(&full_list); + + spin_lock_irqsave(&barn->lock, flags); + + list_splice_init(&barn->sheaves_full, &full_list); + barn->nr_full = 0; + list_splice_init(&barn->sheaves_empty, &empty_list); + barn->nr_empty = 0; + + spin_unlock_irqrestore(&barn->lock, flags); + + list_for_each_entry_safe(sheaf, sheaf2, &full_list, barn_list) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + } + + list_for_each_entry_safe(sheaf, sheaf2, &empty_list, barn_list) + free_empty_sheaf(s, sheaf); +} + /* * Slab allocation and freeing */ @@ -3344,11 +3748,40 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) put_partials_cpu(s, c); } -struct slub_flush_work { - struct work_struct work; - struct kmem_cache *s; - bool skip; -}; +static inline void flush_this_cpu_slab(struct kmem_cache *s) +{ + struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); + + if (c->slab) + flush_slab(s, c); + + put_partials(s); +} + +static bool has_cpu_slab(int cpu, struct kmem_cache *s) +{ + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + + return c->slab || slub_percpu_partial(c); +} + +#else /* CONFIG_SLUB_TINY */ +static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } +static inline bool has_cpu_slab(int cpu, struct kmem_cache *s) { return false; } +static inline void flush_this_cpu_slab(struct kmem_cache *s) { } +#endif /* CONFIG_SLUB_TINY */ + +static bool has_pcs_used(int cpu, struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + + if (!s->cpu_sheaves) + return false; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + return (pcs->spare || pcs->main->size); +} /* * Flush cpu slab. 
@@ -3358,30 +3791,18 @@ struct slub_flush_work { static void flush_cpu_slab(struct work_struct *w) { struct kmem_cache *s; - struct kmem_cache_cpu *c; struct slub_flush_work *sfw; sfw = container_of(w, struct slub_flush_work, work); s = sfw->s; - c = this_cpu_ptr(s->cpu_slab); - - if (c->slab) - flush_slab(s, c); - - put_partials(s); -} -static bool has_cpu_slab(int cpu, struct kmem_cache *s) -{ - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + if (s->cpu_sheaves) + pcs_flush_all(s); - return c->slab || slub_percpu_partial(c); + flush_this_cpu_slab(s); } -static DEFINE_MUTEX(flush_lock); -static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); - static void flush_all_cpus_locked(struct kmem_cache *s) { struct slub_flush_work *sfw; @@ -3392,7 +3813,7 @@ static void flush_all_cpus_locked(struct kmem_cache *s) for_each_online_cpu(cpu) { sfw = &per_cpu(slub_flush, cpu); - if (!has_cpu_slab(cpu, s)) { + if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) { sfw->skip = true; continue; } @@ -3428,19 +3849,15 @@ static int slub_cpu_dead(unsigned int cpu) struct kmem_cache *s; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) + list_for_each_entry(s, &slab_caches, list) { __flush_cpu_slab(s, cpu); + if (s->cpu_sheaves) + __pcs_flush_all_cpu(s, cpu); + } mutex_unlock(&slab_mutex); return 0; } -#else /* CONFIG_SLUB_TINY */ -static inline void flush_all_cpus_locked(struct kmem_cache *s) { } -static inline void flush_all(struct kmem_cache *s) { } -static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } -static inline int slub_cpu_dead(unsigned int cpu) { return 0; } -#endif /* CONFIG_SLUB_TINY */ - /* * Check if the objects in a per cpu structure fit numa * locality expectations. @@ -4191,30 +4608,240 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, } /* - * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) - * have the fastpath folded into their functions. So no function call - * overhead for requests that can be satisfied on the fastpath. - * - * The fastpath works by first checking if the lockless freelist can be used. - * If not then __slab_alloc is called for slow processing. + * Replace the empty main sheaf with a (at least partially) full sheaf. * - * Otherwise we can simply pick the next object from the lockless free list. + * Must be called with the cpu_sheaves local lock locked. If successful, returns + * the pcs pointer and the local lock locked (possibly on a different cpu than + * initially called). If not successful, returns NULL and the local lock + * unlocked. 
*/ -static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, - gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) +static struct slub_percpu_sheaves * +__pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, gfp_t gfp) { - void *object; - bool init = false; + struct slab_sheaf *empty = NULL; + struct slab_sheaf *full; + struct node_barn *barn; + bool can_alloc; - s = slab_pre_alloc_hook(s, gfpflags); - if (unlikely(!s)) - return NULL; + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + if (pcs->spare && pcs->spare->size > 0) { + swap(pcs->main, pcs->spare); + return pcs; + } + + barn = get_barn(s); + + full = barn_replace_empty_sheaf(barn, pcs->main); + + if (full) { + stat(s, BARN_GET); + pcs->main = full; + return pcs; + } + + stat(s, BARN_GET_FAIL); + + can_alloc = gfpflags_allow_blocking(gfp); + + if (can_alloc) { + if (pcs->spare) { + empty = pcs->spare; + pcs->spare = NULL; + } else { + empty = barn_get_empty_sheaf(barn); + } + } + + local_unlock(&s->cpu_sheaves->lock); + + if (!can_alloc) + return NULL; + + if (empty) { + if (!refill_sheaf(s, empty, gfp)) { + full = empty; + } else { + /* + * we must be very low on memory so don't bother + * with the barn + */ + free_empty_sheaf(s, empty); + } + } else { + full = alloc_full_sheaf(s, gfp); + } + + if (!full) + return NULL; + + /* + * we can reach here only when gfpflags_allow_blocking + * so this must not be an irq + */ + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + /* + * If we are returning empty sheaf, we either got it from the + * barn or had to allocate one. If we are returning a full + * sheaf, it's due to racing or being migrated to a different + * cpu. Breaching the barn's sheaf limits should be thus rare + * enough so just ignore them to simplify the recovery. 
+ */ + + if (pcs->main->size == 0) { + barn_put_empty_sheaf(barn, pcs->main); + pcs->main = full; + return pcs; + } + + if (!pcs->spare) { + pcs->spare = full; + return pcs; + } + + if (pcs->spare->size == 0) { + barn_put_empty_sheaf(barn, pcs->spare); + pcs->spare = full; + return pcs; + } + + barn_put_full_sheaf(barn, full); + stat(s, BARN_PUT); + + return pcs; +} + +static __fastpath_inline +void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp) +{ + struct slub_percpu_sheaves *pcs; + void *object; + +#ifdef CONFIG_NUMA + if (static_branch_unlikely(&strict_numa)) { + if (current->mempolicy) + return NULL; + } +#endif + + if (!local_trylock(&s->cpu_sheaves->lock)) + return NULL; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == 0)) { + pcs = __pcs_replace_empty_main(s, pcs, gfp); + if (unlikely(!pcs)) + return NULL; + } + + object = pcs->main->objects[--pcs->main->size]; + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, ALLOC_PCS); + + return object; +} + +static __fastpath_inline +unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *main; + unsigned int allocated = 0; + unsigned int batch; + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + return allocated; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == 0)) { + + struct slab_sheaf *full; + + if (pcs->spare && pcs->spare->size > 0) { + swap(pcs->main, pcs->spare); + goto do_alloc; + } + + full = barn_replace_empty_sheaf(get_barn(s), pcs->main); + + if (full) { + stat(s, BARN_GET); + pcs->main = full; + goto do_alloc; + } + + stat(s, BARN_GET_FAIL); + + local_unlock(&s->cpu_sheaves->lock); + + /* + * Once full sheaves in barn are depleted, let the bulk + * allocation continue from slab pages, otherwise we would just + * be copying arrays of pointers twice. + */ + return allocated; + } + +do_alloc: + + main = pcs->main; + batch = min(size, main->size); + + main->size -= batch; + memcpy(p, main->objects + main->size, batch * sizeof(void *)); + + local_unlock(&s->cpu_sheaves->lock); + + stat_add(s, ALLOC_PCS, batch); + + allocated += batch; + + if (batch < size) { + p += batch; + size -= batch; + goto next_batch; + } + + return allocated; +} + + +/* + * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) + * have the fastpath folded into their functions. So no function call + * overhead for requests that can be satisfied on the fastpath. + * + * The fastpath works by first checking if the lockless freelist can be used. + * If not then __slab_alloc is called for slow processing. + * + * Otherwise we can simply pick the next object from the lockless free list. + */ +static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) +{ + void *object; + bool init = false; + + s = slab_pre_alloc_hook(s, gfpflags); + if (unlikely(!s)) + return NULL; object = kfence_alloc(s, orig_size, gfpflags); if (unlikely(object)) goto out; - object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); + if (s->cpu_sheaves && node == NUMA_NO_NODE) + object = alloc_from_pcs(s, gfpflags); + + if (!object) + object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); maybe_wipe_obj_freeptr(s, object); init = slab_want_init_on_alloc(gfpflags, s); @@ -4591,6 +5218,295 @@ slab_empty: discard_slab(s, slab); } +/* + * pcs is locked. 
We should have get rid of the spare sheaf and obtained an + * empty sheaf, while the main sheaf is full. We want to install the empty sheaf + * as a main sheaf, and make the current main sheaf a spare sheaf. + * + * However due to having relinquished the cpu_sheaves lock when obtaining + * the empty sheaf, we need to handle some unlikely but possible cases. + * + * If we put any sheaf to barn here, it's because we were interrupted or have + * been migrated to a different cpu, which should be rare enough so just ignore + * the barn's limits to simplify the handling. + * + * An alternative scenario that gets us here is when we fail + * barn_replace_full_sheaf(), because there's no empty sheaf available in the + * barn, so we had to allocate it by alloc_empty_sheaf(). But because we saw the + * limit on full sheaves was not exceeded, we assume it didn't change and just + * put the full sheaf there. + */ +static void __pcs_install_empty_sheaf(struct kmem_cache *s, + struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty) +{ + struct node_barn *barn; + + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + /* This is what we expect to find if nobody interrupted us. */ + if (likely(!pcs->spare)) { + pcs->spare = pcs->main; + pcs->main = empty; + return; + } + + barn = get_barn(s); + + /* + * Unlikely because if the main sheaf had space, we would have just + * freed to it. Get rid of our empty sheaf. + */ + if (pcs->main->size < s->sheaf_capacity) { + barn_put_empty_sheaf(barn, empty); + return; + } + + /* Also unlikely for the same reason */ + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + barn_put_empty_sheaf(barn, empty); + return; + } + + /* + * We probably failed barn_replace_full_sheaf() due to no empty sheaf + * available there, but we allocated one, so finish the job. + */ + barn_put_full_sheaf(barn, pcs->main); + stat(s, BARN_PUT); + pcs->main = empty; +} + +/* + * Replace the full main sheaf with a (at least partially) empty sheaf. + * + * Must be called with the cpu_sheaves local lock locked. If successful, returns + * the pcs pointer and the local lock locked (possibly on a different cpu than + * initially called). If not successful, returns NULL and the local lock + * unlocked. + */ +static struct slub_percpu_sheaves * +__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) +{ + struct slab_sheaf *empty; + struct node_barn *barn; + bool put_fail; + +restart: + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + barn = get_barn(s); + put_fail = false; + + if (!pcs->spare) { + empty = barn_get_empty_sheaf(barn); + if (empty) { + pcs->spare = pcs->main; + pcs->main = empty; + return pcs; + } + goto alloc_empty; + } + + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + return pcs; + } + + empty = barn_replace_full_sheaf(barn, pcs->main); + + if (!IS_ERR(empty)) { + stat(s, BARN_PUT); + pcs->main = empty; + return pcs; + } + + if (PTR_ERR(empty) == -E2BIG) { + /* Since we got here, spare exists and is full */ + struct slab_sheaf *to_flush = pcs->spare; + + stat(s, BARN_PUT_FAIL); + + pcs->spare = NULL; + local_unlock(&s->cpu_sheaves->lock); + + sheaf_flush_unused(s, to_flush); + empty = to_flush; + goto got_empty; + } + + /* + * We could not replace full sheaf because barn had no empty + * sheaves. We can still allocate it and put the full sheaf in + * __pcs_install_empty_sheaf(), but if we fail to allocate it, + * make sure to count the fail. 
+ */ + put_fail = true; + +alloc_empty: + local_unlock(&s->cpu_sheaves->lock); + + empty = alloc_empty_sheaf(s, GFP_NOWAIT); + if (empty) + goto got_empty; + + if (put_fail) + stat(s, BARN_PUT_FAIL); + + if (!sheaf_flush_main(s)) + return NULL; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return NULL; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + /* + * we flushed the main sheaf so it should be empty now, + * but in case we got preempted or migrated, we need to + * check again + */ + if (pcs->main->size == s->sheaf_capacity) + goto restart; + + return pcs; + +got_empty: + if (!local_trylock(&s->cpu_sheaves->lock)) { + barn_put_empty_sheaf(barn, empty); + return NULL; + } + + pcs = this_cpu_ptr(s->cpu_sheaves); + __pcs_install_empty_sheaf(s, pcs, empty); + + return pcs; +} + +/* + * Free an object to the percpu sheaves. + * The object is expected to have passed slab_free_hook() already. + */ +static __fastpath_inline +bool free_to_pcs(struct kmem_cache *s, void *object) +{ + struct slub_percpu_sheaves *pcs; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return false; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == s->sheaf_capacity)) { + + pcs = __pcs_replace_full_main(s, pcs); + if (unlikely(!pcs)) + return false; + } + + pcs->main->objects[pcs->main->size++] = object; + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, FREE_PCS); + + return true; +} + +/* + * Bulk free objects to the percpu sheaves. + * Unlike free_to_pcs() this includes the calls to all necessary hooks + * and the fallback to freeing to slab pages. + */ +static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *main, *empty; + bool init = slab_want_init_on_free(s); + unsigned int batch, i = 0; + struct node_barn *barn; + + while (i < size) { + struct slab *slab = virt_to_slab(p[i]); + + memcg_slab_free_hook(s, slab, p + i, 1); + alloc_tagging_slab_free_hook(s, slab, p + i, 1); + + if (unlikely(!slab_free_hook(s, p[i], init, false))) { + p[i] = p[--size]; + if (!size) + return; + continue; + } + + i++; + } + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + goto fallback; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (likely(pcs->main->size < s->sheaf_capacity)) + goto do_free; + + barn = get_barn(s); + + if (!pcs->spare) { + empty = barn_get_empty_sheaf(barn); + if (!empty) + goto no_empty; + + pcs->spare = pcs->main; + pcs->main = empty; + goto do_free; + } + + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + goto do_free; + } + + empty = barn_replace_full_sheaf(barn, pcs->main); + if (IS_ERR(empty)) { + stat(s, BARN_PUT_FAIL); + goto no_empty; + } + + stat(s, BARN_PUT); + pcs->main = empty; + +do_free: + main = pcs->main; + batch = min(size, s->sheaf_capacity - main->size); + + memcpy(main->objects + main->size, p, batch * sizeof(void *)); + main->size += batch; + + local_unlock(&s->cpu_sheaves->lock); + + stat_add(s, FREE_PCS, batch); + + if (batch < size) { + p += batch; + size -= batch; + goto next_batch; + } + + return; + +no_empty: + local_unlock(&s->cpu_sheaves->lock); + + /* + * if we depleted all empty sheaves in the barn or there are too + * many full sheaves, free the rest to slab pages + */ +fallback: + __kmem_cache_free_bulk(s, size, p); +} + #ifndef CONFIG_SLUB_TINY /* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that @@ -4677,7 +5593,10 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, memcg_slab_free_hook(s, 
slab, &object, 1); alloc_tagging_slab_free_hook(s, slab, &object, 1); - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) + if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) + return; + + if (!s->cpu_sheaves || !free_to_pcs(s, object)) do_slab_free(s, slab, object, object, 1, addr); } @@ -5273,6 +6192,15 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (!size) return; + /* + * freeing to sheaves is so incompatible with the detached freelist so + * once we go that way, we have to do everything differently + */ + if (s && s->cpu_sheaves) { + free_to_pcs_bulk(s, size, p); + return; + } + do { struct detached_freelist df; @@ -5391,7 +6319,7 @@ error: int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { - int i; + unsigned int i = 0; if (!size) return 0; @@ -5400,9 +6328,20 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, if (unlikely(!s)) return 0; - i = __kmem_cache_alloc_bulk(s, flags, size, p); - if (unlikely(i == 0)) - return 0; + if (s->cpu_sheaves) + i = alloc_from_pcs_bulk(s, size, p); + + if (i < size) { + /* + * If we ran out of memory, don't bother with freeing back to + * the percpu sheaves, we have bigger problems. + */ + if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) { + if (i > 0) + __kmem_cache_free_bulk(s, i, p); + return 0; + } + } /* * memcg and kmem_cache debug support and memory initialization. @@ -5412,11 +6351,11 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, slab_want_init_on_alloc(flags, s), s->object_size))) { return 0; } - return i; + + return size; } EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); - /* * Object placement in a slab is made very easy because we always start at * offset 0. 
If we tune the size of the object to the alignment then we can @@ -5550,7 +6489,7 @@ static inline int calculate_order(unsigned int size) } static void -init_kmem_cache_node(struct kmem_cache_node *n) +init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) { n->nr_partial = 0; spin_lock_init(&n->list_lock); @@ -5560,6 +6499,9 @@ init_kmem_cache_node(struct kmem_cache_node *n) atomic_long_set(&n->total_objects, 0); INIT_LIST_HEAD(&n->full); #endif + n->barn = barn; + if (barn) + barn_init(barn); } #ifndef CONFIG_SLUB_TINY @@ -5590,6 +6532,26 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) } #endif /* CONFIG_SLUB_TINY */ +static int init_percpu_sheaves(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + local_trylock_init(&pcs->lock); + + pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); + + if (!pcs->main) + return -ENOMEM; + } + + return 0; +} + static struct kmem_cache *kmem_cache_node; /* @@ -5625,7 +6587,7 @@ static void early_kmem_cache_node_alloc(int node) slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; kmem_cache_node->node[node] = n; - init_kmem_cache_node(n); + init_kmem_cache_node(n, NULL); inc_slabs_node(kmem_cache_node, node, slab->objects); /* @@ -5641,6 +6603,13 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) struct kmem_cache_node *n; for_each_kmem_cache_node(s, node, n) { + if (n->barn) { + WARN_ON(n->barn->nr_full); + WARN_ON(n->barn->nr_empty); + kfree(n->barn); + n->barn = NULL; + } + s->node[node] = NULL; kmem_cache_free(kmem_cache_node, n); } @@ -5649,6 +6618,8 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) void __kmem_cache_release(struct kmem_cache *s) { cache_random_seq_destroy(s); + if (s->cpu_sheaves) + pcs_destroy(s); #ifndef CONFIG_SLUB_TINY free_percpu(s->cpu_slab); #endif @@ -5661,18 +6632,29 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) for_each_node_mask(node, slab_nodes) { struct kmem_cache_node *n; + struct node_barn *barn = NULL; if (slab_state == DOWN) { early_kmem_cache_node_alloc(node); continue; } + + if (s->cpu_sheaves) { + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); + + if (!barn) + return 0; + } + n = kmem_cache_alloc_node(kmem_cache_node, GFP_KERNEL, node); - - if (!n) + if (!n) { + kfree(barn); return 0; + } + + init_kmem_cache_node(n, barn); - init_kmem_cache_node(n); s->node[node] = n; } return 1; @@ -5929,6 +6911,8 @@ int __kmem_cache_shutdown(struct kmem_cache *s) flush_all_cpus_locked(s); /* Attempt to free all objects */ for_each_kmem_cache_node(s, node, n) { + if (n->barn) + barn_shrink(s, n->barn); free_partial(s, n); if (n->nr_partial || node_nr_slabs(n)) return 1; @@ -6132,6 +7116,9 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) for (i = 0; i < SHRINK_PROMOTE_MAX; i++) INIT_LIST_HEAD(promote + i); + if (n->barn) + barn_shrink(s, n->barn); + spin_lock_irqsave(&n->list_lock, flags); /* @@ -6211,12 +7198,24 @@ static int slab_mem_going_online_callback(int nid) */ mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { + struct node_barn *barn = NULL; + /* * The structure may already exist if the node was previously * onlined and offlined. 
*/ if (get_node(s, nid)) continue; + + if (s->cpu_sheaves) { + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); + + if (!barn) { + ret = -ENOMEM; + goto out; + } + } + /* * XXX: kmem_cache_alloc_node will fallback to other nodes * since memory is not yet available from the node that @@ -6224,10 +7223,13 @@ static int slab_mem_going_online_callback(int nid) */ n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); if (!n) { + kfree(barn); ret = -ENOMEM; goto out; } - init_kmem_cache_node(n); + + init_kmem_cache_node(n, barn); + s->node[nid] = n; } /* @@ -6440,6 +7442,17 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, set_cpu_partial(s); + if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY) + && !(s->flags & SLAB_DEBUG_FLAGS)) { + s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); + if (!s->cpu_sheaves) { + err = -ENOMEM; + goto out; + } + // TODO: increase capacity to grow slab_sheaf up to next kmalloc size? + s->sheaf_capacity = args->sheaf_capacity; + } + #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif @@ -6456,6 +7469,12 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, if (!alloc_kmem_cache_cpus(s)) goto out; + if (s->cpu_sheaves) { + err = init_percpu_sheaves(s); + if (err) + goto out; + } + err = 0; /* Mutex is not taken during early boot */ @@ -6908,6 +7927,12 @@ static ssize_t order_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(order); +static ssize_t sheaf_capacity_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%u\n", s->sheaf_capacity); +} +SLAB_ATTR_RO(sheaf_capacity); + static ssize_t min_partial_show(struct kmem_cache *s, char *buf) { return sysfs_emit(buf, "%lu\n", s->min_partial); @@ -7255,8 +8280,10 @@ static ssize_t text##_store(struct kmem_cache *s, \ } \ SLAB_ATTR(text); \ +STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf); STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); +STAT_ATTR(FREE_PCS, free_cpu_sheaf); STAT_ATTR(FREE_FASTPATH, free_fastpath); STAT_ATTR(FREE_SLOWPATH, free_slowpath); STAT_ATTR(FREE_FROZEN, free_frozen); @@ -7281,6 +8308,14 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); +STAT_ATTR(SHEAF_FLUSH, sheaf_flush); +STAT_ATTR(SHEAF_REFILL, sheaf_refill); +STAT_ATTR(SHEAF_ALLOC, sheaf_alloc); +STAT_ATTR(SHEAF_FREE, sheaf_free); +STAT_ATTR(BARN_GET, barn_get); +STAT_ATTR(BARN_GET_FAIL, barn_get_fail); +STAT_ATTR(BARN_PUT, barn_put); +STAT_ATTR(BARN_PUT_FAIL, barn_put_fail); #endif /* CONFIG_SLUB_STATS */ #ifdef CONFIG_KFENCE @@ -7311,6 +8346,7 @@ static struct attribute *slab_attrs[] = { &object_size_attr.attr, &objs_per_slab_attr.attr, &order_attr.attr, + &sheaf_capacity_attr.attr, &min_partial_attr.attr, &cpu_partial_attr.attr, &objects_partial_attr.attr, @@ -7342,8 +8378,10 @@ static struct attribute *slab_attrs[] = { &remote_node_defrag_ratio_attr.attr, #endif #ifdef CONFIG_SLUB_STATS + &alloc_cpu_sheaf_attr.attr, &alloc_fastpath_attr.attr, &alloc_slowpath_attr.attr, + &free_cpu_sheaf_attr.attr, &free_fastpath_attr.attr, &free_slowpath_attr.attr, &free_frozen_attr.attr, @@ -7368,6 +8406,14 @@ static struct attribute *slab_attrs[] = { &cpu_partial_free_attr.attr, &cpu_partial_node_attr.attr, &cpu_partial_drain_attr.attr, + &sheaf_flush_attr.attr, + &sheaf_refill_attr.attr, + &sheaf_alloc_attr.attr, + &sheaf_free_attr.attr, + &barn_get_attr.attr, + &barn_get_fail_attr.attr, + &barn_put_attr.attr, 
+ &barn_put_fail_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, -- cgit v1.2.3 From bbfe987c5a2854705393ad79813074e5eadcbde6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 13:10:22 +0200 Subject: PM: hibernate: Fix pm_hibernation_mode_is_suspend() build breakage Commit 495c8d35035e ("PM: hibernate: Add pm_hibernation_mode_is_suspend()"), which introduced pm_hibernation_mode_is_suspend(), did not define it in the case when CONFIG_HIBERNATION is unset but CONFIG_SUSPEND is set. Subsequent commit 0a6e9e098fcc ("drm/amd: Fix hybrid sleep") made the amdgpu driver use that function, which led to kernel build breakage in the case mentioned above [1]. Address this by using appropriate #ifdeffery around the definition of pm_hibernation_mode_is_suspend(). Fixes: 0a6e9e098fcc ("drm/amd: Fix hybrid sleep") Reported-by: KernelCI bot Closes: https://groups.io/g/kernelci-results/topic/regression_pm_testing/115439919 [1] Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello (AMD) --- include/linux/suspend.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 0664c685f0b2..b02876f1ae38 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -276,7 +276,6 @@ extern void arch_suspend_enable_irqs(void); extern int pm_suspend(suspend_state_t state); extern bool sync_on_suspend_enabled; -bool pm_hibernation_mode_is_suspend(void); #else /* !CONFIG_SUSPEND */ #define suspend_valid_only_mem NULL @@ -289,7 +288,6 @@ static inline bool pm_suspend_via_firmware(void) { return false; } static inline bool pm_resume_via_firmware(void) { return false; } static inline bool pm_suspend_no_platform(void) { return false; } static inline bool pm_suspend_default_s2idle(void) { return false; } -static inline bool pm_hibernation_mode_is_suspend(void) { return false; } static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } @@ -420,6 +418,12 @@ static inline int hibernate_quiet_exec(int (*func)(void *data), void *data) { } #endif /* CONFIG_HIBERNATION */ +#if defined(CONFIG_HIBERNATION) && defined(CONFIG_SUSPEND) +bool pm_hibernation_mode_is_suspend(void); +#else +static inline bool pm_hibernation_mode_is_suspend(void) { return false; } +#endif + int arch_resume_nosmt(void); #ifdef CONFIG_HIBERNATION_SNAPSHOT_DEV -- cgit v1.2.3 From a036bb0e60ad2828c3498bff7465bcbb247b7436 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 25 Sep 2025 14:56:30 -0500 Subject: of: base: Add of_get_next_child_with_prefix() stub 1fcc67e3a354 ("of: base: Add for_each_child_of_node_with_prefix()") added of_get_next_child_with_prefix() but did not add a stub for the !CONFIG_OF case. Add an of_get_next_child_with_prefix() stub so users of for_each_child_of_node_with_prefix() can be built for compile testing even when !CONFIG_OF.
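For illustration, a usage sketch (hypothetical driver code; the "port" prefix and function name are made up) that can now be built with COMPILE_TEST even when !CONFIG_OF:

	static int count_port_children(struct device_node *parent)
	{
		int n = 0;

		/* The iterator declares 'child' itself (scoped, auto-put).
		 * Under !CONFIG_OF it expands to the new stub, which returns
		 * NULL, so the loop body never runs but the code still
		 * compiles. */
		for_each_child_of_node_with_prefix(parent, child, "port")
			n++;

		return n;
	}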
Fixes: 1fcc67e3a354 ("of: base: Add for_each_child_of_node_with_prefix()") Signed-off-by: Bjorn Helgaas Signed-off-by: Rob Herring (Arm) --- include/linux/of.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of.h b/include/linux/of.h index a62154aeda1b..5e2c6ed9370a 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -550,6 +550,13 @@ static inline struct device_node *of_get_next_child( return NULL; } +static inline struct device_node *of_get_next_child_with_prefix( + const struct device_node *node, struct device_node *prev, + const char *prefix) +{ + return NULL; +} + static inline struct device_node *of_get_next_available_child( const struct device_node *node, struct device_node *prev) { -- cgit v1.2.3 From 4e9510f16218802b5fc0d593d8707d4e7ebf9774 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 24 Sep 2025 01:27:07 -0400 Subject: ptr_ring: drop duplicated tail zeroing code We have some rather subtle code around zeroing tail entries, minimizing cache bouncing. Let's put it all in one place. Doing this also reduces the text size slightly, e.g. for drivers/vhost/net.o Before: text: 15,114 bytes After: text: 15,082 bytes Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Link: https://patch.msgid.link/adb9d941de4a2b619ddb2be271a9939849e70687.1758690291.git.mst@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/ptr_ring.h | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 551329220e4f..a736b16859a6 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -243,6 +243,24 @@ static inline bool ptr_ring_empty_bh(struct ptr_ring *r) return ret; } +/* Zero entries from tail to specified head. + * NB: if consumer_head can be >= r->size need to fixup tail later. + */ +static inline void __ptr_ring_zero_tail(struct ptr_ring *r, int consumer_head) +{ + int head = consumer_head - 1; + + /* Zero out entries in the reverse order: this way we touch the + * cache line that producer might currently be reading the last; + * producer won't make progress and touch other cache lines + * besides the first one until we write out all entries. + */ + while (likely(head >= r->consumer_tail)) + r->queue[head--] = NULL; + + r->consumer_tail = consumer_head; +} + /* Must only be called after __ptr_ring_peek returned !NULL */ static inline void __ptr_ring_discard_one(struct ptr_ring *r) { @@ -261,8 +279,7 @@ static inline void __ptr_ring_discard_one(struct ptr_ring *r) /* Note: we must keep consumer_head valid at all times for __ptr_ring_empty * to work correctly. */ - int consumer_head = r->consumer_head; - int head = consumer_head++; + int consumer_head = r->consumer_head + 1; /* Once we have processed enough entries invalidate them in * the ring all at once so producer can reuse their space in the ring. @@ -270,16 +287,9 @@ static inline void __ptr_ring_discard_one(struct ptr_ring *r) * but helps keep the implementation simple. */ if (unlikely(consumer_head - r->consumer_tail >= r->batch || - consumer_head >= r->size)) { - /* Zero out entries in the reverse order: this way we touch the - * cache line that producer might currently be reading the last; - * producer won't make progress and touch other cache lines - * besides the first one until we write out all entries. 
- */ - while (likely(head >= r->consumer_tail)) - r->queue[head--] = NULL; - r->consumer_tail = consumer_head; - } + consumer_head >= r->size)) + __ptr_ring_zero_tail(r, consumer_head); + if (unlikely(consumer_head >= r->size)) { consumer_head = 0; r->consumer_tail = 0; @@ -513,7 +523,6 @@ static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n, void (*destroy)(void *)) { unsigned long flags; - int head; spin_lock_irqsave(&r->consumer_lock, flags); spin_lock(&r->producer_lock); @@ -525,17 +534,14 @@ static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n, * Clean out buffered entries (for simplicity). This way following code * can test entries for NULL and if not assume they are valid. */ - head = r->consumer_head - 1; - while (likely(head >= r->consumer_tail)) - r->queue[head--] = NULL; - r->consumer_tail = r->consumer_head; + __ptr_ring_zero_tail(r, r->consumer_head); /* * Go over entries in batch, start moving head back and copy entries. * Stop when we run into previously unconsumed entries. */ while (n) { - head = r->consumer_head - 1; + int head = r->consumer_head - 1; if (head < 0) head = r->size - 1; if (r->queue[head]) { -- cgit v1.2.3 From cc2f08129925b437bf28f7f7822f20dac083a87c Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 24 Sep 2025 12:40:33 +0000 Subject: ethtool: add FEC bins histogram report IEEE 802.3ck-2022 defines counters for FEC bins and 802.3df-2024 clarifies them a bit further. Implement a reporting interface as an addition to the FEC stats already available in ethtool. Drivers can leave the bin counter uninitialized if per-lane values are provided. In this case the core will recalculate the sum for the bin. Signed-off-by: Vadim Fedorenko Reviewed-by: Aleksandr Loktionov Link: https://patch.msgid.link/20250924124037.1508846-2-vadim.fedorenko@linux.dev Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 29 +++++++++ Documentation/networking/ethtool-netlink.rst | 5 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 3 +- .../net/ethernet/fungible/funeth/funeth_ethtool.c | 3 +- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 3 +- drivers/net/ethernet/intel/ice/ice_ethtool.c | 4 +- .../ethernet/marvell/octeontx2/nic/otx2_ethtool.c | 3 +- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 3 +- drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c | 3 +- drivers/net/ethernet/sfc/ethtool.c | 3 +- drivers/net/ethernet/sfc/siena/ethtool.c | 3 +- drivers/net/netdevsim/ethtool.c | 25 +++++++- include/linux/ethtool.h | 25 +++++++- include/uapi/linux/ethtool_netlink_generated.h | 12 ++++ net/ethtool/fec.c | 75 +++++++++++++++++++++- 15 files changed, 186 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 7a7594713f1f..6a0fb1974513 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -1219,6 +1219,30 @@ attribute-sets: name: udp-ports type: nest nested-attributes: tunnel-udp + - + name: fec-hist + attr-cnt-name: __ethtool-a-fec-hist-cnt + attributes: + - + name: pad + type: pad + - + name: bin-low + type: u32 + doc: Low bound of FEC bin (inclusive) + - + name: bin-high + type: u32 + doc: High bound of FEC bin (inclusive) + - + name: bin-val + type: uint + doc: Error count in the bin (optional if per-lane values exist) + - + name: bin-val-per-lane + type: binary + sub-type: u64 + doc: An array of per-lane error counters in the bin (optional) - name: fec-stat
attr-cnt-name: __ethtool-a-fec-stat-cnt @@ -1242,6 +1266,11 @@ attribute-sets: name: corr-bits type: binary sub-type: u64 + - + name: hist + type: nest + multi-attr: True + nested-attributes: fec-hist - name: fec attr-cnt-name: __ethtool-a-fec-cnt diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index ab20c644af24..b270886c5f5d 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1541,6 +1541,11 @@ Drivers fill in the statistics in the following structure: .. kernel-doc:: include/linux/ethtool.h :identifiers: ethtool_fec_stats +Statistics may have FEC bins histogram attribute ``ETHTOOL_A_FEC_STAT_HIST`` +as defined in IEEE 802.3ck-2022 and 802.3df-2024. Nested attributes will have +the range of FEC errors in the bin (inclusive) and the amount of error events +in the bin. + FEC_SET ======= diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index be32ef8f5c96..41686a6f84b5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -3208,7 +3208,8 @@ static int bnxt_get_fecparam(struct net_device *dev, } static void bnxt_get_fec_stats(struct net_device *dev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct bnxt *bp = netdev_priv(dev); u64 *rx; diff --git a/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c b/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c index ba83dbf4ed22..1966dba512f8 100644 --- a/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c +++ b/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c @@ -930,7 +930,8 @@ static void fun_get_rmon_stats(struct net_device *netdev, } static void fun_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *stats) + struct ethtool_fec_stats *stats, + struct ethtool_fec_hist *hist) { const struct funeth_priv *fp = netdev_priv(netdev); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index a752d0e3db3a..a5eefa28454c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -1659,7 +1659,8 @@ static void hns3_set_msglevel(struct net_device *netdev, u32 msg_level) } static void hns3_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct hnae3_handle *handle = hns3_get_handle(netdev); struct hnae3_ae_dev *ae_dev = hns3_get_ae_dev(handle); diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 348acd46a0ef..dc131779d426 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -4624,10 +4624,12 @@ static int ice_get_port_fec_stats(struct ice_hw *hw, u16 pcs_quad, u16 pcs_port, * ice_get_fec_stats - returns FEC correctable, uncorrectable stats per netdev * @netdev: network interface device structure * @fec_stats: buffer to hold FEC statistics for given port + * @hist: buffer to put FEC histogram statistics for given port * */ static void ice_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_port_topology 
port_topology; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c index 998c734ff839..b90e23dc49de 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c @@ -1283,7 +1283,8 @@ end: } static void otx2_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct otx2_nic *pfvf = netdev_priv(netdev); struct cgx_fw_data *rsp; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index d507366d773e..bcc3bbb78cc9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1927,7 +1927,8 @@ static int mlx5e_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) } static void mlx5e_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct mlx5e_priv *priv = netdev_priv(netdev); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c index fecb8c602024..d55d2ac1c3b9 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c @@ -1718,7 +1718,8 @@ fbnic_get_pause_stats(struct net_device *netdev, static void fbnic_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct fbnic_net *fbn = netdev_priv(netdev); struct fbnic_phy_stats *phy_stats; diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c index 23c6a7df78d0..18fe5850a978 100644 --- a/drivers/net/ethernet/sfc/ethtool.c +++ b/drivers/net/ethernet/sfc/ethtool.c @@ -217,7 +217,8 @@ static int efx_ethtool_set_wol(struct net_device *net_dev, } static void efx_ethtool_get_fec_stats(struct net_device *net_dev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct efx_nic *efx = efx_netdev_priv(net_dev); diff --git a/drivers/net/ethernet/sfc/siena/ethtool.c b/drivers/net/ethernet/sfc/siena/ethtool.c index 994909789bfe..8c3ebd0617fb 100644 --- a/drivers/net/ethernet/sfc/siena/ethtool.c +++ b/drivers/net/ethernet/sfc/siena/ethtool.c @@ -217,7 +217,8 @@ static int efx_ethtool_set_wol(struct net_device *net_dev, } static void efx_ethtool_get_fec_stats(struct net_device *net_dev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct efx_nic *efx = netdev_priv(net_dev); diff --git a/drivers/net/netdevsim/ethtool.c b/drivers/net/netdevsim/ethtool.c index f631d90c428a..36a201533aae 100644 --- a/drivers/net/netdevsim/ethtool.c +++ b/drivers/net/netdevsim/ethtool.c @@ -165,11 +165,34 @@ nsim_set_fecparam(struct net_device *dev, struct ethtool_fecparam *fecparam) return 0; } +static const struct ethtool_fec_hist_range netdevsim_fec_ranges[] = { + { 0, 0}, + { 1, 3}, + { 4, 7}, + { 0, 0} +}; + static void -nsim_get_fec_stats(struct net_device *dev, struct ethtool_fec_stats *fec_stats) +nsim_get_fec_stats(struct net_device *dev, struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { + struct ethtool_fec_hist_value *values = hist->values; + + hist->ranges = netdevsim_fec_ranges; + 
fec_stats->corrected_blocks.total = 123; fec_stats->uncorrectable_blocks.total = 4; + + values[0].per_lane[0] = 125; + values[0].per_lane[1] = 120; + values[0].per_lane[2] = 100; + values[0].per_lane[3] = 100; + values[1].sum = 12; + values[2].sum = 2; + values[2].per_lane[0] = 2; + values[2].per_lane[1] = 0; + values[2].per_lane[2] = 0; + values[2].per_lane[3] = 0; } static int nsim_get_ts_info(struct net_device *dev, diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c869b7f8bce8..c2d8b4ec62eb 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -492,7 +492,29 @@ struct ethtool_pause_stats { }; #define ETHTOOL_MAX_LANES 8 +/** + * IEEE 802.3ck/df defines 16 bins for FEC histogram plus one more for + * the end-of-list marker, total 17 items + */ +#define ETHTOOL_FEC_HIST_MAX 17 +/** + * struct ethtool_fec_hist_range - error bits range for FEC histogram + * statistics + * @low: low bound of the bin (inclusive) + * @high: high bound of the bin (inclusive) + */ +struct ethtool_fec_hist_range { + u16 low; + u16 high; +}; +struct ethtool_fec_hist { + struct ethtool_fec_hist_value { + u64 sum; + u64 per_lane[ETHTOOL_MAX_LANES]; + } values[ETHTOOL_FEC_HIST_MAX]; + const struct ethtool_fec_hist_range *ranges; +}; /** * struct ethtool_fec_stats - statistics for IEEE 802.3 FEC * @corrected_blocks: number of received blocks corrected by FEC @@ -1214,7 +1236,8 @@ struct ethtool_ops { int (*set_link_ksettings)(struct net_device *, const struct ethtool_link_ksettings *); void (*get_fec_stats)(struct net_device *dev, - struct ethtool_fec_stats *fec_stats); + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist); int (*get_fecparam)(struct net_device *, struct ethtool_fecparam *); int (*set_fecparam)(struct net_device *, diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index e3b8813465d7..0e8ac0d974e2 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -561,12 +561,24 @@ enum { ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) }; +enum { + ETHTOOL_A_FEC_HIST_PAD = 1, + ETHTOOL_A_FEC_HIST_BIN_LOW, + ETHTOOL_A_FEC_HIST_BIN_HIGH, + ETHTOOL_A_FEC_HIST_BIN_VAL, + ETHTOOL_A_FEC_HIST_BIN_VAL_PER_LANE, + + __ETHTOOL_A_FEC_HIST_CNT, + ETHTOOL_A_FEC_HIST_MAX = (__ETHTOOL_A_FEC_HIST_CNT - 1) +}; + enum { ETHTOOL_A_FEC_STAT_UNSPEC, ETHTOOL_A_FEC_STAT_PAD, ETHTOOL_A_FEC_STAT_CORRECTED, ETHTOOL_A_FEC_STAT_UNCORR, ETHTOOL_A_FEC_STAT_CORR_BITS, + ETHTOOL_A_FEC_STAT_HIST, __ETHTOOL_A_FEC_STAT_CNT, ETHTOOL_A_FEC_STAT_MAX = (__ETHTOOL_A_FEC_STAT_CNT - 1) diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c index e7d3f2c352a3..4669e74cbcaa 100644 --- a/net/ethtool/fec.c +++ b/net/ethtool/fec.c @@ -17,6 +17,7 @@ struct fec_reply_data { u64 stats[1 + ETHTOOL_MAX_LANES]; u8 cnt; } corr, uncorr, corr_bits; + struct ethtool_fec_hist fec_stat_hist; }; #define FEC_REPDATA(__reply_base) \ @@ -113,7 +114,10 @@ static int fec_prepare_data(const struct ethnl_req_info *req_base, struct ethtool_fec_stats stats; ethtool_stats_init((u64 *)&stats, sizeof(stats) / 8); - dev->ethtool_ops->get_fec_stats(dev, &stats); + ethtool_stats_init((u64 *)data->fec_stat_hist.values, + sizeof(data->fec_stat_hist.values) / 8); + dev->ethtool_ops->get_fec_stats(dev, &stats, + &data->fec_stat_hist); fec_stats_recalc(&data->corr, &stats.corrected_blocks); fec_stats_recalc(&data->uncorr, &stats.uncorrectable_blocks); @@ -157,13 +161,77 @@ static int fec_reply_size(const struct 
ethnl_req_info *req_base, len += nla_total_size(sizeof(u8)) + /* _FEC_AUTO */ nla_total_size(sizeof(u32)); /* _FEC_ACTIVE */ - if (req_base->flags & ETHTOOL_FLAG_STATS) + if (req_base->flags & ETHTOOL_FLAG_STATS) { len += 3 * nla_total_size_64bit(sizeof(u64) * (1 + ETHTOOL_MAX_LANES)); + /* add FEC bins information */ + len += (nla_total_size(0) + /* _A_FEC_HIST */ + nla_total_size(4) + /* _A_FEC_HIST_BIN_LOW */ + nla_total_size(4) + /* _A_FEC_HIST_BIN_HI */ + /* _A_FEC_HIST_BIN_VAL + per-lane values */ + nla_total_size_64bit(sizeof(u64)) + + nla_total_size_64bit(sizeof(u64) * ETHTOOL_MAX_LANES)) * + ETHTOOL_FEC_HIST_MAX; + } return len; } +static int fec_put_hist(struct sk_buff *skb, + const struct ethtool_fec_hist *hist) +{ + const struct ethtool_fec_hist_range *ranges = hist->ranges; + const struct ethtool_fec_hist_value *values = hist->values; + struct nlattr *nest; + int i, j; + u64 sum; + + if (!ranges) + return 0; + + for (i = 0; i < ETHTOOL_FEC_HIST_MAX; i++) { + if (i && !ranges[i].low && !ranges[i].high) + break; + + if (WARN_ON_ONCE(values[i].sum == ETHTOOL_STAT_NOT_SET && + values[i].per_lane[0] == ETHTOOL_STAT_NOT_SET)) + break; + + nest = nla_nest_start(skb, ETHTOOL_A_FEC_STAT_HIST); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_LOW, + ranges[i].low) || + nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_HIGH, + ranges[i].high)) + goto err_cancel_hist; + sum = 0; + for (j = 0; j < ETHTOOL_MAX_LANES; j++) { + if (values[i].per_lane[j] == ETHTOOL_STAT_NOT_SET) + break; + sum += values[i].per_lane[j]; + } + if (nla_put_uint(skb, ETHTOOL_A_FEC_HIST_BIN_VAL, + values[i].sum == ETHTOOL_STAT_NOT_SET ? + sum : values[i].sum)) + goto err_cancel_hist; + if (j && nla_put_64bit(skb, ETHTOOL_A_FEC_HIST_BIN_VAL_PER_LANE, + sizeof(u64) * j, + values[i].per_lane, + ETHTOOL_A_FEC_HIST_PAD)) + goto err_cancel_hist; + + nla_nest_end(skb, nest); + } + + return 0; + +err_cancel_hist: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) { struct nlattr *nest; @@ -183,6 +251,9 @@ static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) data->corr_bits.stats, ETHTOOL_A_FEC_STAT_PAD)) goto err_cancel; + if (fec_put_hist(skb, &data->fec_stat_hist)) + goto err_cancel; + nla_nest_end(skb, nest); return 0; -- cgit v1.2.3 From 105ce7ad57e492b75ab40f2dc591db645fadbaa2 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 24 Sep 2025 23:14:53 +0200 Subject: net: airoha: npu: Add a NPU callback to initialize flow stats Introduce a NPU callback to initialize flow stats and remove NPU stats initialization from the airoha_npu_get routine. Add num_stats_entries to the airoha_npu_ppe_stats_setup routine. This patch makes the code more readable since NPU statistics are now initialized on demand by the NPU consumer (at the moment NPU statistics are configured just by the airoha_eth driver). Moreover, this patch allows the NPU consumer (PPE module) to explicitly enable/disable NPU flow stats.
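As a usage sketch (assuming only the airoha_npu_get()/ops.ppe_init_stats() API from this patch; the dev variable and the calling context are hypothetical), a consumer that does not want flow stats now simply never invokes the callback:

	/* Hypothetical consumer opting out of NPU flow stats: take the NPU
	 * reference and never call the new ppe_init_stats() op.
	 */
	struct airoha_npu *npu = airoha_npu_get(dev);

	if (IS_ERR(npu))
		return PTR_ERR(npu);

	/* ... use npu->ops.* without ever setting up a stats buffer ... */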
Signed-off-by: Lorenzo Bianconi Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250924-airoha-npu-init-stats-callback-v1-1-88bdf3c941b2@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_npu.c | 24 ++++++------------------ drivers/net/ethernet/airoha/airoha_ppe.c | 19 +++++++++++++------ include/linux/soc/airoha/airoha_offload.h | 7 ++++--- 3 files changed, 23 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/airoha/airoha_npu.c b/drivers/net/ethernet/airoha/airoha_npu.c index e1d131d6115c..8c883f2b2d36 100644 --- a/drivers/net/ethernet/airoha/airoha_npu.c +++ b/drivers/net/ethernet/airoha/airoha_npu.c @@ -379,15 +379,13 @@ out: return err; } -static int airoha_npu_stats_setup(struct airoha_npu *npu, - dma_addr_t foe_stats_addr) +static int airoha_npu_ppe_stats_setup(struct airoha_npu *npu, + dma_addr_t foe_stats_addr, + u32 num_stats_entries) { - int err, size = PPE_STATS_NUM_ENTRIES * sizeof(*npu->stats); + int err, size = num_stats_entries * sizeof(*npu->stats); struct ppe_mbox_data *ppe_data; - if (!size) /* flow stats are disabled */ - return 0; - ppe_data = kzalloc(sizeof(*ppe_data), GFP_ATOMIC); if (!ppe_data) return -ENOMEM; @@ -542,7 +540,7 @@ static void airoha_npu_wlan_irq_disable(struct airoha_npu *npu, int q) regmap_clear_bits(npu->regmap, REG_IRQ_RXDONE(q), NPU_IRQ_RX_MASK(q)); } -struct airoha_npu *airoha_npu_get(struct device *dev, dma_addr_t *stats_addr) +struct airoha_npu *airoha_npu_get(struct device *dev) { struct platform_device *pdev; struct device_node *np; @@ -581,17 +579,6 @@ struct airoha_npu *airoha_npu_get(struct device *dev, dma_addr_t *stats_addr) goto error_module_put; } - if (stats_addr) { - int err; - - err = airoha_npu_stats_setup(npu, *stats_addr); - if (err) { - dev_err(dev, "failed to allocate npu stats buffer\n"); - npu = ERR_PTR(err); - goto error_module_put; - } - } - return npu; error_module_put: @@ -643,6 +630,7 @@ static int airoha_npu_probe(struct platform_device *pdev) npu->dev = dev; npu->ops.ppe_init = airoha_npu_ppe_init; npu->ops.ppe_deinit = airoha_npu_ppe_deinit; + npu->ops.ppe_init_stats = airoha_npu_ppe_stats_setup; npu->ops.ppe_flush_sram_entries = airoha_npu_ppe_flush_sram_entries; npu->ops.ppe_foe_commit_entry = airoha_npu_foe_commit_entry; npu->ops.wlan_init_reserved_memory = airoha_npu_wlan_init_memory; diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index 78473527ff50..691361b25407 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -1243,12 +1243,11 @@ static int airoha_ppe_flush_sram_entries(struct airoha_ppe *ppe, static struct airoha_npu *airoha_ppe_npu_get(struct airoha_eth *eth) { - struct airoha_npu *npu = airoha_npu_get(eth->dev, - ð->ppe->foe_stats_dma); + struct airoha_npu *npu = airoha_npu_get(eth->dev); if (IS_ERR(npu)) { request_module("airoha-npu"); - npu = airoha_npu_get(eth->dev, ð->ppe->foe_stats_dma); + npu = airoha_npu_get(eth->dev); } return npu; @@ -1257,6 +1256,7 @@ static struct airoha_npu *airoha_ppe_npu_get(struct airoha_eth *eth) static int airoha_ppe_offload_setup(struct airoha_eth *eth) { struct airoha_npu *npu = airoha_ppe_npu_get(eth); + struct airoha_ppe *ppe = eth->ppe; int err; if (IS_ERR(npu)) @@ -1266,12 +1266,19 @@ static int airoha_ppe_offload_setup(struct airoha_eth *eth) if (err) goto error_npu_put; - airoha_ppe_hw_init(eth->ppe); - err = airoha_ppe_flush_sram_entries(eth->ppe, npu); + if (PPE_STATS_NUM_ENTRIES) 
{ + err = npu->ops.ppe_init_stats(npu, ppe->foe_stats_dma, + PPE_STATS_NUM_ENTRIES); + if (err) + goto error_npu_put; + } + + airoha_ppe_hw_init(ppe); + err = airoha_ppe_flush_sram_entries(ppe, npu); if (err) goto error_npu_put; - airoha_ppe_foe_flow_stats_reset(eth->ppe, npu); + airoha_ppe_foe_flow_stats_reset(ppe, npu); rcu_assign_pointer(eth->npu, npu); synchronize_rcu(); diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h index 1dc5b4e35ef9..6f66eb339b3f 100644 --- a/include/linux/soc/airoha/airoha_offload.h +++ b/include/linux/soc/airoha/airoha_offload.h @@ -181,6 +181,8 @@ struct airoha_npu { struct { int (*ppe_init)(struct airoha_npu *npu); int (*ppe_deinit)(struct airoha_npu *npu); + int (*ppe_init_stats)(struct airoha_npu *npu, + dma_addr_t addr, u32 num_stats_entries); int (*ppe_flush_sram_entries)(struct airoha_npu *npu, dma_addr_t foe_addr, int sram_num_entries); @@ -206,7 +208,7 @@ struct airoha_npu { }; #if (IS_BUILTIN(CONFIG_NET_AIROHA_NPU) || IS_MODULE(CONFIG_NET_AIROHA_NPU)) -struct airoha_npu *airoha_npu_get(struct device *dev, dma_addr_t *stats_addr); +struct airoha_npu *airoha_npu_get(struct device *dev); void airoha_npu_put(struct airoha_npu *npu); static inline int airoha_npu_wlan_init_reserved_memory(struct airoha_npu *npu) @@ -256,8 +258,7 @@ static inline void airoha_npu_wlan_disable_irq(struct airoha_npu *npu, int q) npu->ops.wlan_disable_irq(npu, q); } #else -static inline struct airoha_npu *airoha_npu_get(struct device *dev, - dma_addr_t *foe_stats_addr) +static inline struct airoha_npu *airoha_npu_get(struct device *dev) { return NULL; } -- cgit v1.2.3 From 87608c2a7718dcac5deef801fb3c18cf36fb0233 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 26 Sep 2025 17:52:39 +0800 Subject: bpf: Remove duplicate crypto/sha2.h header ./include/linux/bpf.h: crypto/sha2.h is included more than once. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=25501 Signed-off-by: Jiapeng Chong Acked-by: Quentin Monnet Link: https://lore.kernel.org/r/20250926095240.3397539-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6338e54a9b1f..fe2a396d8ac6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -32,7 +32,6 @@ #include #include #include -#include struct bpf_verifier_env; struct bpf_verifier_log; -- cgit v1.2.3 From fed7eaa4f037361fe4f3d4170649d6849a25998d Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Thu, 25 Sep 2025 12:42:16 -0700 Subject: PM: runtime: Update kerneldoc return codes APIs based on __pm_runtime_idle() (pm_runtime_idle(), pm_request_idle()) do not return 1 when already suspended. They return -EAGAIN. This is already covered in the docs, so the entry for "1" is redundant and conflicting. (pm_runtime_put() and pm_runtime_put_sync() were previously incorrect, but that's fixed in "PM: runtime: pm_runtime_put{,_sync}() returns 1 when already suspended", to ensure consistency with APIs like pm_runtime_put_autosuspend().) RPM_GET_PUT APIs based on __pm_runtime_suspend() do return 1 when already suspended, but the language is a little unclear -- it's not really an "error", so it seems better to list as a clarification before the 0/success case. Additionally, they only actually return 1 when the refcount makes it to 0; if the usage counter is still non-zero, we return 0. 
pm_runtime_put(), etc., also don't appear at first like they can ever see "-EAGAIN: Runtime PM usage_count non-zero", because in non-racy conditions, pm_runtime_put() would drop its reference count, see it's non-zero, and return early (in __pm_runtime_idle()). However, it's possible to race with another actor that increments the usage_count afterward, since rpm_idle() is protected by a separate lock; in such a case, we may see -EAGAIN. Because this case is only seen in the presence of concurrent actors, it makes sense to clarify that this is when "usage_count **became** non-zero", by way of some racing actor. Lastly, pm_runtime_put_sync_suspend() duplicated some -EAGAIN language. Fix that. Fixes: 271ff96d6066 ("PM: runtime: Document return values of suspend-related API functions") Link: https://lore.kernel.org/linux-pm/aJ5pkEJuixTaybV4@google.com/ Signed-off-by: Brian Norris Reviewed-by: Sakari Ailus Cc: 6.17+ # 6.17+ Signed-off-by: Rafael J. Wysocki --- include/linux/pm_runtime.h | 56 +++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index d88d6b6ccf5b..d1ff76e0e2d0 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -350,13 +350,12 @@ static inline int pm_runtime_force_resume(struct device *dev) { return -ENXIO; } * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero, Runtime PM status change ongoing - * or device not in %RPM_ACTIVE state. + * * -EAGAIN: Runtime PM usage counter non-zero, Runtime PM status change + * ongoing or device not in %RPM_ACTIVE state. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM idle and suspend callbacks. */ @@ -370,14 +369,15 @@ static inline int pm_runtime_idle(struct device *dev) * @dev: Target device. * * Return: + * * 1: Success; device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change + * ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ @@ -396,14 +396,15 @@ static inline int pm_runtime_suspend(struct device *dev) * engaging its "idle check" callback. * * Return: + * * 1: Success; device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change + * ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ @@ -433,13 +434,12 @@ static inline int pm_runtime_resume(struct device *dev) * * 0: Success. 
* * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero, Runtime PM status change ongoing - * or device not in %RPM_ACTIVE state. + * * -EAGAIN: Runtime PM usage counter non-zero, Runtime PM status change + * ongoing or device not in %RPM_ACTIVE state. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int pm_request_idle(struct device *dev) { @@ -464,15 +464,16 @@ static inline int pm_request_resume(struct device *dev) * equivalent pm_runtime_autosuspend() for @dev asynchronously. * * Return: + * * 1: Success; device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter non-zero or Runtime PM status change + * ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int pm_request_autosuspend(struct device *dev) { @@ -540,15 +541,16 @@ static inline int pm_runtime_resume_and_get(struct device *dev) * equal to 0, queue up a work item for @dev like in pm_request_idle(). * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int pm_runtime_put(struct device *dev) { @@ -565,15 +567,16 @@ DEFINE_FREE(pm_runtime_put, struct device *, if (_T) pm_runtime_put(_T)) * equal to 0, queue up a work item for @dev like in pm_request_autosuspend(). * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. */ static inline int __pm_runtime_put_autosuspend(struct device *dev) { @@ -590,15 +593,16 @@ static inline int __pm_runtime_put_autosuspend(struct device *dev) * in pm_request_autosuspend(). * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. 
- * * 1: Device already suspended. */ static inline int pm_runtime_put_autosuspend(struct device *dev) { @@ -619,14 +623,15 @@ static inline int pm_runtime_put_autosuspend(struct device *dev) * if it returns an error code. * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ @@ -646,15 +651,15 @@ static inline int pm_runtime_put_sync(struct device *dev) * if it returns an error code. * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. - * * -EAGAIN: usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ @@ -677,15 +682,16 @@ static inline int pm_runtime_put_sync_suspend(struct device *dev) * if it returns an error code. * * Return: + * * 1: Success. Usage counter dropped to zero, but device was already suspended. * * 0: Success. * * -EINVAL: Runtime PM error. * * -EACCES: Runtime PM disabled. - * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EAGAIN: Runtime PM usage counter became non-zero or Runtime PM status + * change ongoing. * * -EBUSY: Runtime PM child_count non-zero. * * -EPERM: Device PM QoS resume latency 0. * * -EINPROGRESS: Suspend already in progress. * * -ENOSYS: CONFIG_PM not enabled. - * * 1: Device already suspended. * Other values and conditions for the above values are possible as returned by * Runtime PM suspend callbacks. */ -- cgit v1.2.3 From 4540aed51b12bc13364149bf95f6ecef013197c0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Sep 2025 19:12:00 +0200 Subject: bpf: Enforce expected_attach_type for tailcall compatibility Yinhao et al. recently reported: Our fuzzer tool discovered an uninitialized pointer issue in the bpf_prog_test_run_xdp() function within the Linux kernel's BPF subsystem. This leads to a NULL pointer dereference when a BPF program attempts to dereference the txq member of the struct xdp_buff object. The test initializes two programs of BPF_PROG_TYPE_XDP: progA acts as the entry point for bpf_prog_test_run_xdp() and its expected_attach_type can be neither BPF_XDP_DEVMAP nor BPF_XDP_CPUMAP. progA calls into a slot of a tailcall map it owns. progB's expected_attach_type must be BPF_XDP_DEVMAP to pass xdp_is_valid_access() validation. The program returns struct xdp_md's egress_ifindex, and the latter is only allowed to be accessed under the mentioned expected_attach_type. progB is then inserted into the tailcall which progA calls.
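Roughly, the reported construction looks like this (a hedged reconstruction for illustration only; the SEC() names, the map layout and the program bodies are assumptions, not taken from the report):

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	struct {
		__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
		__uint(max_entries, 1);
		__uint(key_size, sizeof(__u32));
		__uint(value_size, sizeof(__u32));
	} jmp_table SEC(".maps");

	/* progB: loaded with expected_attach_type == BPF_XDP_DEVMAP, which is
	 * what makes xdp_is_valid_access() accept loads of egress_ifindex.
	 */
	SEC("xdp/devmap")
	int progB(struct xdp_md *ctx)
	{
		return ctx->egress_ifindex; /* dereferences xdp_buff->txq */
	}

	/* progA: plain XDP entry point run via BPF_PROG_TEST_RUN; user space
	 * places progB into slot 0 of jmp_table beforehand.
	 */
	SEC("xdp")
	int progA(struct xdp_md *ctx)
	{
		bpf_tail_call(ctx, &jmp_table, 0);
		return XDP_PASS;
	}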
The underlying issue goes beyond XDP though. Another example are programs of type BPF_PROG_TYPE_CGROUP_SOCK_ADDR. sock_addr_is_valid_access() as well as sock_addr_func_proto() have different logic depending on the programs' expected_attach_type. Similarly, a program attached to BPF_CGROUP_INET4_GETPEERNAME should not be allowed doing a tailcall into a program which calls bpf_bind() out of BPF which is only enabled for BPF_CGROUP_INET4_CONNECT. In short, specifying expected_attach_type allows to open up additional functionality or restrictions beyond what the basic bpf_prog_type enables. The use of tailcalls must not violate these constraints. Fix it by enforcing expected_attach_type in __bpf_prog_map_compatible(). Note that we only enforce this for tailcall maps, but not for BPF devmaps or cpumaps: There, the programs are invoked through dev_map_bpf_prog_run*() and cpu_map_bpf_prog_run*() which set up a new environment / context and therefore these situations are not prone to this issue. Fixes: 5e43f899b03a ("bpf: Check attach type at prog load time") Reported-by: Yinhao Hu Reported-by: Kaiyan Mei Reviewed-by: Dongliang Mu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20250926171201.188490-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + kernel/bpf/core.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fe2a396d8ac6..a98c83346134 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -289,6 +289,7 @@ struct bpf_map_owner { bool xdp_has_frags; u64 storage_cookie[MAX_BPF_CGROUP_STORAGE_TYPE]; const struct btf_type *attach_func_proto; + enum bpf_attach_type expected_attach_type; }; struct bpf_map { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9b64674df16b..d595fe512498 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2361,6 +2361,7 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, map->owner->type = prog_type; map->owner->jited = fp->jited; map->owner->xdp_has_frags = aux->xdp_has_frags; + map->owner->expected_attach_type = fp->expected_attach_type; map->owner->attach_func_proto = aux->attach_func_proto; for_each_cgroup_storage_type(i) { map->owner->storage_cookie[i] = @@ -2372,6 +2373,10 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, ret = map->owner->type == prog_type && map->owner->jited == fp->jited && map->owner->xdp_has_frags == aux->xdp_has_frags; + if (ret && + map->map_type == BPF_MAP_TYPE_PROG_ARRAY && + map->owner->expected_attach_type != fp->expected_attach_type) + ret = false; for_each_cgroup_storage_type(i) { if (!ret) break; -- cgit v1.2.3 From 2f7d98f10b8f64525b2c74cae7d70ae5278eb654 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Jul 2025 15:32:31 -0400 Subject: Have cc(1) catch attempts to modify ->f_path There are very few places that have cause to do that - all in core VFS now, and all done to files that are not yet opened (or visible to anybody else, for that matter). Let's turn f_path into a union of struct path __f_path and const struct path f_path. It's C, not C++ - 6.5.2.3[4] in C99 and later explicitly allows that kind of type-punning. That way any attempts to bypass these checks will be either very easy to catch, or (if the bastards get sufficiently creative to make it hard to spot with grep alone) very clearly malicious - and still catchable with a bit of instrumentation for sparse. 
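For illustration, what the union buys in practice (a minimal sketch; the exact diagnostic wording is compiler-dependent):

	/* Outside the few core VFS spots, assignment now fails to compile: */
	file->f_path.dentry = dentry;
	/* error: assignment of member 'dentry' in read-only object */

	/* Core VFS, before the file is opened, goes through the alias: */
	file->__f_path.dentry = dentry;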
Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/file_table.c | 4 ++-- fs/namei.c | 8 ++++---- fs/open.c | 10 +++++----- include/linux/fs.h | 7 ++++++- 4 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/fs/file_table.c b/fs/file_table.c index 85b53e39138d..b223d873e48b 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -171,7 +171,7 @@ static int init_file(struct file *f, int flags, const struct cred *cred) * the respective member when opening the file. */ mutex_init(&f->f_pos_lock); - memset(&f->f_path, 0, sizeof(f->f_path)); + memset(&f->__f_path, 0, sizeof(f->f_path)); memset(&f->f_ra, 0, sizeof(f->f_ra)); f->f_flags = flags; @@ -319,7 +319,7 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) static void file_init_path(struct file *file, const struct path *path, const struct file_operations *fop) { - file->f_path = *path; + file->__f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); diff --git a/fs/namei.c b/fs/namei.c index 3eb0408e3400..ba8bf73d2f9c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3563,8 +3563,8 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; - file->f_path.dentry = DENTRY_NOT_SET; - file->f_path.mnt = nd->path.mnt; + file->__f_path.dentry = DENTRY_NOT_SET; + file->__f_path.mnt = nd->path.mnt; error = dir->i_op->atomic_open(dir, dentry, file, open_to_namei_flags(open_flag), mode); d_lookup_done(dentry); @@ -3932,8 +3932,8 @@ int vfs_tmpfile(struct mnt_idmap *idmap, child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) return -ENOMEM; - file->f_path.mnt = parentpath->mnt; - file->f_path.dentry = child; + file->__f_path.mnt = parentpath->mnt; + file->__f_path.dentry = child; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); diff --git a/fs/open.c b/fs/open.c index 9655158c3885..f4bdf7693530 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1022,8 +1022,8 @@ cleanup_all: put_file_access(f); cleanup_file: path_put(&f->f_path); - f->f_path.mnt = NULL; - f->f_path.dentry = NULL; + f->__f_path.mnt = NULL; + f->__f_path.dentry = NULL; f->f_inode = NULL; return error; } @@ -1050,7 +1050,7 @@ int finish_open(struct file *file, struct dentry *dentry, { BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ - file->f_path.dentry = dentry; + file->__f_path.dentry = dentry; return do_dentry_open(file, open); } EXPORT_SYMBOL(finish_open); @@ -1071,7 +1071,7 @@ EXPORT_SYMBOL(finish_open); */ int finish_no_open(struct file *file, struct dentry *dentry) { - file->f_path.dentry = dentry; + file->__f_path.dentry = dentry; return 0; } EXPORT_SYMBOL(finish_no_open); @@ -1091,7 +1091,7 @@ int vfs_open(const struct path *path, struct file *file) { int ret; - file->f_path = *path; + file->__f_path = *path; ret = do_dentry_open(file, NULL); if (!ret) { /* diff --git a/include/linux/fs.h b/include/linux/fs.h index af514fae4e2d..1fb02c76ae09 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1082,6 +1082,8 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_cred: stashed credentials of creator/opener * @f_owner: file owner * @f_path: path of the file + * @__f_path: writable alias for @f_path; *ONLY* for core VFS and only before + * the file gets open * 
@f_pos_lock: lock protecting file position * @f_pipe: specific to pipes * @f_pos: file position @@ -1107,7 +1109,10 @@ struct file { const struct cred *f_cred; struct fown_struct *f_owner; /* --- cacheline 1 boundary (64 bytes) --- */ - struct path f_path; + union { + const struct path f_path; + struct path __f_path; + }; union { /* regular files (with FMODE_ATOMIC_POS) and directories */ struct mutex f_pos_lock; -- cgit v1.2.3 From 1ddf1636e0e058adf2231486da0419243eb49539 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 22 Sep 2025 09:06:30 +0300 Subject: net/mlx5: Add IFC bit for TIR/SQ order capability Before this cap, firmware requested a certain creation order between TIR objects and SQs of the same transport domain to properly support the self loopback prevention feature. If order is not preserved, explicit modify_tir operations are necessary after the opening of the SQs. When set, this cap bit indicates that this firmware requirement / limitation no longer holds. Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1758521191-814350-2-git-send-email-tariqt@nvidia.com Reviewed-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 0cf187e13def..c0f5fee7a4a5 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1895,7 +1895,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_2a0[0x7]; u8 mkey_pcie_tph[0x1]; - u8 reserved_at_2a8[0x2]; + u8 reserved_at_2a8[0x1]; + u8 tis_tir_td_order[0x1]; u8 psp[0x1]; u8 shampo[0x1]; -- cgit v1.2.3 From 137d1a6355131457723b51a34192320d93d15654 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 22 Sep 2025 09:06:31 +0300 Subject: net/mlx5: IFC add balance ID and LAG per MP group bits Add interface definitions for load balance ID and LAG per multiplane group functionality. This patch introduces the hardware capability bits needed to support balance ID in multiplane LAG configurations. The new fields include: - load_balance_id: 4-bit field for balance identifier. - lag_per_mp_group: capability bit for LAG per multiplane group support. These interface additions are prerequisites for implementing balance ID support in the MLX5 driver. 
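As a consumer-side sketch (assumed, since this IFC-only patch adds no users; the follow-up driver changes are what actually read the bits), the new fields would be queried through the existing cmd_hca_cap_2 accessor:

	/* Hypothetical capability check; mdev is a struct mlx5_core_dev *. */
	if (MLX5_CAP_GEN_2(mdev, lag_per_mp_group)) {
		u8 balance_id = MLX5_CAP_GEN_2(mdev, load_balance_id);

		/* ... configure LAG per multiplane group using balance_id ... */
	}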
Signed-off-by: Mark Bloch Reviewed-by: Shay Drori Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1758521191-814350-3-git-send-email-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c0f5fee7a4a5..07614cd95bed 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2235,12 +2235,16 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_440[0x8]; u8 max_num_eqs_24b[0x18]; - u8 reserved_at_460[0x160]; + u8 reserved_at_460[0x144]; + u8 load_balance_id[0x4]; + u8 reserved_at_5a8[0x18]; u8 query_adjacent_functions_id[0x1]; u8 ingress_egress_esw_vport_connect[0x1]; u8 function_id_type_vhca_id[0x1]; - u8 reserved_at_5c3[0xd]; + u8 reserved_at_5c3[0x1]; + u8 lag_per_mp_group[0x1]; + u8 reserved_at_5c5[0xb]; u8 delegate_vhca_management_profiles[0x10]; u8 delegated_vhca_max[0x10]; -- cgit v1.2.3 From 2db579838296239545554443234fafb8f485cca0 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Tue, 23 Sep 2025 12:07:06 +0100 Subject: mm/page_vma_mapped: track if the page is mapped across page table boundary Patch series "mm: Improve mlock tracking for large folios", v3. The patchset includes several fixes and improvements related to mlock tracking of large folios. The main objective is to reduce the undercount of Mlocked memory in /proc/meminfo and improve the accuracy of the statistics. Patches 1-2: These patches address a minor race condition in folio_referenced_one() related to mlock_vma_folio(). Currently, mlock_vma_folio() is called on a large folio without the page table lock, which can result in a race condition with unmap (i.e. MADV_DONTNEED). This can lead to partially mapped folios on the unevictable LRU list. While not a significant issue, I do not believe backporting is necessary. Patch 3: This patch adds mlocking logic similar to folio_referenced_one() to try_to_unmap_one(), allowing for mlocking of large folios where possible. Patches 4-5: These patches modify finish_fault() and faultaround to map in the entire folio when possible, enabling efficient mlocking upon addition to the rmap. Patch 6: This patch makes rmap mlock large folios if they are fully mapped, addressing the primary source of mlock undercount for large folios. This patch (of 6): Add a PVMW_PGTABLE_CROSSSED flag that page_vma_mapped_walk() will set if the page is mapped across a page table boundary. Unlike other PVMW_* flags, this one is the result of page_vma_mapped_walk() and not set by the caller. folio_referenced_one() will use it to detect whether it is safe to mlock the folio.
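In caller terms the intended use reads roughly as follows (a sketch based on the description above; the pvmw/folio/vma setup, locking and the actual reference checks are elided):

	while (page_vma_mapped_walk(&pvmw)) {
		/* ... per-PTE reference checks ... */
	}

	/* Set by the walk itself, not by the caller: if the folio's mapping
	 * crossed a page table boundary, mlocking it here is not safe.
	 */
	if (!(pvmw.flags & PVMW_PGTABLE_CROSSED))
		mlock_vma_folio(folio, vma);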
[akpm@linux-foundation.org: s/CROSSSED/CROSSED/] Link: https://lkml.kernel.org/r/20250923110711.690639-1-kirill@shutemov.name Link: https://lkml.kernel.org/r/20250923110711.690639-2-kirill@shutemov.name Signed-off-by: Kiryl Shutsemau Reviewed-by: Shakeel Butt Cc: Baolin Wang Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/rmap.h | 5 +++++ mm/page_vma_mapped.c | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index e8aff6d2deda..daa92a58585d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -922,6 +922,11 @@ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr, /* Look for migration entries rather than present PTEs */ #define PVMW_MIGRATION (1 << 1) +/* Result flags */ + +/* The page is mapped across page table boundary */ +#define PVMW_PGTABLE_CROSSED (1 << 16) + struct page_vma_mapped_walk { unsigned long pfn; unsigned long nr_pages; diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index e981a1a292d2..c498a91b6706 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -309,6 +309,7 @@ next_pte: } pte_unmap(pvmw->pte); pvmw->pte = NULL; + pvmw->flags |= PVMW_PGTABLE_CROSSED; goto restart; } pvmw->pte++; -- cgit v1.2.3 From 4d6fc29f36341d7795db1d1819b4c15fe9be7b23 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Wed, 24 Sep 2025 00:16:59 +0530 Subject: mm/ksm: fix incorrect KSM counter handling in mm_struct during fork Patch series "mm/ksm: Fix incorrect accounting of KSM counters during fork", v3. The first patch in this series fixes the incorrect accounting of KSM counters such as ksm_merging_pages, ksm_rmap_items, and the global ksm_zero_pages during fork. The following patch add a selftest to verify the ksm_merging_pages counter was updated correctly during fork. Test Results ============ Without the first patch ----------------------- # [RUN] test_fork_ksm_merging_page_count not ok 10 ksm_merging_page in child: 32 With the first patch -------------------- # [RUN] test_fork_ksm_merging_page_count ok 10 ksm_merging_pages is not inherited after fork This patch (of 2): Currently, the KSM-related counters in `mm_struct`, such as `ksm_merging_pages`, `ksm_rmap_items`, and `ksm_zero_pages`, are inherited by the child process during fork. This results in inconsistent accounting. When a process uses KSM, identical pages are merged and an rmap item is created for each merged page. The `ksm_merging_pages` and `ksm_rmap_items` counters are updated accordingly. However, after a fork, these counters are copied to the child while the corresponding rmap items are not. As a result, when the child later triggers an unmerge, there are no rmap items present in the child, so the counters remain stale, leading to incorrect accounting. A similar issue exists with `ksm_zero_pages`, which maintains both a global counter and a per-process counter. During fork, the per-process counter is inherited by the child, but the global counter is not incremented. Since the child also references zero pages, the global counter should be updated as well. Otherwise, during zero-page unmerge, both the global and per-process counters are decremented, causing the global counter to become inconsistent. To fix this, ksm_merging_pages and ksm_rmap_items are reset to 0 during fork, and the global ksm_zero_pages counter is updated with the per-process ksm_zero_pages value inherited by the child. 
This ensures that KSM statistics remain accurate and reflect the activity of each process correctly. Link: https://lkml.kernel.org/r/cover.1758648700.git.donettom@linux.ibm.com Link: https://lkml.kernel.org/r/7b9870eb67ccc0d79593940d9dbd4a0b39b5d396.1758648700.git.donettom@linux.ibm.com Fixes: 7609385337a4 ("ksm: count ksm merging pages for each process") Fixes: cb4df4cae4f2 ("ksm: count allocated ksm rmap_items for each process") Fixes: e2942062e01d ("ksm: count all zero pages placed by KSM") Signed-off-by: Donet Tom Reviewed-by: Chengming Zhou Acked-by: David Hildenbrand Cc: Aboorva Devarajan Cc: David Hildenbrand Cc: Donet Tom Cc: "Ritesh Harjani (IBM)" Cc: Wei Yang Cc: xu xin Cc: [6.6+] Signed-off-by: Andrew Morton --- include/linux/ksm.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 22e67ca7cba3..067538fc4d58 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -56,8 +56,14 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm) static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { /* Adding mm to ksm is best effort on fork. */ - if (mm_flags_test(MMF_VM_MERGEABLE, oldmm)) + if (mm_flags_test(MMF_VM_MERGEABLE, oldmm)) { + long nr_ksm_zero_pages = atomic_long_read(&mm->ksm_zero_pages); + + mm->ksm_merging_pages = 0; + mm->ksm_rmap_items = 0; + atomic_long_add(nr_ksm_zero_pages, &ksm_zero_pages); __ksm_enter(mm); + } } static inline int ksm_execve(struct mm_struct *mm) -- cgit v1.2.3 From 989c2f55ca4839121cbf23b5802f8513dbd54e1e Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Fri, 26 Sep 2025 17:24:26 +0800 Subject: mm: silence data-race in update_hiwater_rss KCSAN reports a data race on mm_struct.hiwater_rss, which can be accessed concurrently from various paths like page migration and memory unmapping without synchronization. Since hiwater_rss is a statistical field for accounting purposes, this data race is benign. Annotate both the read and write accesses with data_race() to make KCSAN happy. Link: https://lkml.kernel.org/r/20250926092426.43312-1-lance.yang@linux.dev Signed-off-by: Lance Yang Reported-by: syzbot+60192c8877d0bc92a92b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-mm/68d6364e.050a0220.3390a8.000d.GAE@google.com Acked-by: Vlastimil Babka Cc: David Hildenbrand Cc: Jann Horn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Marco Elver Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index fcb1e72eea40..06978b4dbeb8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2742,7 +2742,7 @@ static inline void update_hiwater_rss(struct mm_struct *mm) unsigned long _rss = get_mm_rss(mm); if (data_race(mm->hiwater_rss) < _rss) - (mm)->hiwater_rss = _rss; + data_race(mm->hiwater_rss = _rss); } static inline void update_hiwater_vm(struct mm_struct *mm) -- cgit v1.2.3 From 81e78b7ec61e89e8bab9736551839f79b063614c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 23 Sep 2025 16:00:58 +0200 Subject: mm: convert folio_page() back to a macro In commit 73b3294b1152 ("mm: simplify folio_page() and folio_page_idx()") we converted folio_page() into a static inline function.
However briefly afterwards in commit a847b17009ec ("mm: constify highmem related functions for improved const-correctness") we had to add some nasty const-away casting to make the compiler happy when checking const correctness. So let's just convert it back to a simple macro so the compiler can check const correctness properly. There is the alternative of using a _Generic() similar to page_folio(), but there is not a lot of benefit compared to just using a simple macro. Link: https://lkml.kernel.org/r/20250923140058.2020023-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Kiryl Shutsemau Reviewed-by: SeongJae Park Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Dev Jain Reviewed-by: Suren Baghdasaryan Reviewed-by: Lance Yang Reviewed-by: Wei Yang Cc: Lorenzo Stoakes Cc: "Liam R. Howlett" Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 568011930e35..48e27768e7ba 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -316,10 +316,7 @@ static __always_inline unsigned long _compound_head(const struct page *page) * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. */ -static inline struct page *folio_page(const struct folio *folio, unsigned long n) -{ - return (struct page *)(&folio->page + n); -} +#define folio_page(folio, n) (&(folio)->page + (n)) static __always_inline int PageTail(const struct page *page) { -- cgit v1.2.3 From c8a935a31bc787db52296944890f300ba9479088 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Tue, 23 Sep 2025 10:52:29 +0100 Subject: lib/string_choices: Add str_assert_deassert() helper Add str_assert_deassert() helper to return "assert" or "deassert" string literal depending on the boolean argument. Also add the inversed variant str_deassert_assert(). Suggested-by: Philipp Zabel Signed-off-by: Lad Prabhakar Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250923095229.2149740-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Kees Cook --- include/linux/string_choices.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index f3ba4f52ff26..6c4077be7742 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -17,6 +17,12 @@ #include +static inline const char *str_assert_deassert(bool v) +{ + return v ? "assert" : "deassert"; +} +#define str_deassert_assert(v) str_assert_deassert(!(v)) + static inline const char *str_enable_disable(bool v) { return v ? "enable" : "disable"; -- cgit v1.2.3 From 3c1ea5c5019ff197aca7e886a3a240c38f6c6f0d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Sep 2025 14:59:47 +0200 Subject: slab: sheaf prefilling for guaranteed allocations Add functions for efficient guaranteed allocations e.g. in a critical section that cannot sleep, when the exact number of allocations is not known beforehand, but an upper limit can be calculated. kmem_cache_prefill_sheaf() returns a sheaf containing at least given number of objects. kmem_cache_alloc_from_sheaf() will allocate an object from the sheaf and is guaranteed not to fail until depleted. kmem_cache_return_sheaf() is for giving the sheaf back to the slab allocator after the critical section. 
This will also attempt to refill it to cache's sheaf capacity for better efficiency of sheaves handling, but it's not strictly necessary to succeed. kmem_cache_refill_sheaf() can be used to refill a previously obtained sheaf to the requested size. If the current size is sufficient, it does nothing. If the requested size exceeds cache's sheaf_capacity and the sheaf's current capacity, the sheaf will be replaced with a new one, hence the indirect pointer parameter. kmem_cache_sheaf_size() can be used to query the current size. The implementation supports requesting sizes that exceed cache's sheaf_capacity, but it is not efficient - such "oversize" sheaves are allocated fresh in kmem_cache_prefill_sheaf() and flushed and freed immediately by kmem_cache_return_sheaf(). kmem_cache_refill_sheaf() might be especially ineffective when replacing a sheaf with a new one of a larger capacity. It is therefore better to size cache's sheaf_capacity accordingly to make oversize sheaves exceptional. CONFIG_SLUB_STATS counters are added for sheaf prefill and return operations. A prefill or return is considered _fast when it is able to grab or return a percpu spare sheaf (even if the sheaf needs a refill to satisfy the request, as those should amortize over time), and _slow otherwise (when the barn or even sheaf allocation/freeing has to be involved). sheaf_prefill_oversize is provided to determine how many prefills were oversize (counter for oversize returns is not necessary as all oversize refills result in oversize returns). When slub_debug is enabled for a cache with sheaves, no percpu sheaves exist for it, but the prefill functionality is still provided simply by all prefilled sheaves becoming oversize. If percpu sheaves are not created for a cache due to not passing the sheaf_capacity argument on cache creation, the prefills also work through oversize sheaves, but there's a WARN_ON_ONCE() to indicate the omission. Reviewed-by: Suren Baghdasaryan Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 16 ++++ mm/slub.c | 263 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 279 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 49acbcdc6696..680193356ac7 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -829,6 +829,22 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment __malloc; #define kmem_cache_alloc_node(...) alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__)) +struct slab_sheaf * +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size); + +int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf **sheafp, unsigned int size); + +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf); + +void *kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *cachep, gfp_t gfp, + struct slab_sheaf *sheaf) __assume_slab_alignment __malloc; +#define kmem_cache_alloc_from_sheaf(...)
\ + alloc_hooks(kmem_cache_alloc_from_sheaf_noprof(__VA_ARGS__)) + +unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf); + /* * These macros allow declaring a kmem_buckets * parameter alongside size, which * can be compiled out with CONFIG_SLAB_BUCKETS=n so that a large number of call diff --git a/mm/slub.c b/mm/slub.c index fec0cdc7ef37..4bd67d5d5ff5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -401,6 +401,11 @@ enum stat_item { BARN_GET_FAIL, /* Failed to get full sheaf from barn */ BARN_PUT, /* Put full sheaf to barn */ BARN_PUT_FAIL, /* Failed to put full sheaf to barn */ + SHEAF_PREFILL_FAST, /* Sheaf prefill grabbed the spare sheaf */ + SHEAF_PREFILL_SLOW, /* Sheaf prefill found no spare sheaf */ + SHEAF_PREFILL_OVERSIZE, /* Allocation of oversize sheaf for prefill */ + SHEAF_RETURN_FAST, /* Sheaf return reattached spare sheaf */ + SHEAF_RETURN_SLOW, /* Sheaf return could not reattach spare */ NR_SLUB_STAT_ITEMS }; @@ -462,6 +467,8 @@ struct slab_sheaf { union { struct rcu_head rcu_head; struct list_head barn_list; + /* only used for prefilled sheafs */ + unsigned int capacity; }; struct kmem_cache *cache; unsigned int size; @@ -2838,6 +2845,30 @@ static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf spin_unlock_irqrestore(&barn->lock, flags); } +static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn) +{ + struct slab_sheaf *sheaf = NULL; + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + if (barn->nr_full) { + sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf, + barn_list); + list_del(&sheaf->barn_list); + barn->nr_full--; + } else if (barn->nr_empty) { + sheaf = list_first_entry(&barn->sheaves_empty, + struct slab_sheaf, barn_list); + list_del(&sheaf->barn_list); + barn->nr_empty--; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return sheaf; +} + /* * If a full sheaf is available, return it and put the supplied empty one to * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't @@ -5036,6 +5067,228 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int nod } EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); +/* + * returns a sheaf that has at least the requested size + * when prefilling is needed, do so with given gfp flags + * + * return NULL if sheaf allocation or prefilling failed + */ +struct slab_sheaf * +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *sheaf = NULL; + + if (unlikely(size > s->sheaf_capacity)) { + + /* + * slab_debug disables cpu sheaves intentionally so all + * prefilled sheaves become "oversize" and we give up on + * performance for the debugging. Same with SLUB_TINY. + * Creating a cache without sheaves and then requesting a + * prefilled sheaf is however not expected, so warn. 
+ */ + WARN_ON_ONCE(s->sheaf_capacity == 0 && + !IS_ENABLED(CONFIG_SLUB_TINY) && + !(s->flags & SLAB_DEBUG_FLAGS)); + + sheaf = kzalloc(struct_size(sheaf, objects, size), gfp); + if (!sheaf) + return NULL; + + stat(s, SHEAF_PREFILL_OVERSIZE); + sheaf->cache = s; + sheaf->capacity = size; + + if (!__kmem_cache_alloc_bulk(s, gfp, size, + &sheaf->objects[0])) { + kfree(sheaf); + return NULL; + } + + sheaf->size = size; + + return sheaf; + } + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (pcs->spare) { + sheaf = pcs->spare; + pcs->spare = NULL; + stat(s, SHEAF_PREFILL_FAST); + } else { + stat(s, SHEAF_PREFILL_SLOW); + sheaf = barn_get_full_or_empty_sheaf(get_barn(s)); + if (sheaf && sheaf->size) + stat(s, BARN_GET); + else + stat(s, BARN_GET_FAIL); + } + + local_unlock(&s->cpu_sheaves->lock); + + + if (!sheaf) + sheaf = alloc_empty_sheaf(s, gfp); + + if (sheaf && sheaf->size < size) { + if (refill_sheaf(s, sheaf, gfp)) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + sheaf = NULL; + } + } + + if (sheaf) + sheaf->capacity = s->sheaf_capacity; + + return sheaf; +} + +/* + * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf() + * + * If the sheaf cannot simply become the percpu spare sheaf, but there's space + * for a full sheaf in the barn, we try to refill the sheaf back to the cache's + * sheaf_capacity to avoid handling partially full sheaves. + * + * If the refill fails because gfp is e.g. GFP_NOWAIT, or the barn is full, the + * sheaf is instead flushed and freed. + */ +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf) +{ + struct slub_percpu_sheaves *pcs; + struct node_barn *barn; + + if (unlikely(sheaf->capacity != s->sheaf_capacity)) { + sheaf_flush_unused(s, sheaf); + kfree(sheaf); + return; + } + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + barn = get_barn(s); + + if (!pcs->spare) { + pcs->spare = sheaf; + sheaf = NULL; + stat(s, SHEAF_RETURN_FAST); + } + + local_unlock(&s->cpu_sheaves->lock); + + if (!sheaf) + return; + + stat(s, SHEAF_RETURN_SLOW); + + /* + * If the barn has too many full sheaves or we fail to refill the sheaf, + * simply flush and free it. + */ + if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES || + refill_sheaf(s, sheaf, gfp)) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + return; + } + + barn_put_full_sheaf(barn, sheaf); + stat(s, BARN_PUT); +} + +/* + * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least + * the given size + * + * the sheaf might be replaced by a new one when requesting more than + * s->sheaf_capacity objects if such replacement is necessary, but the refill + * fails (returning -ENOMEM), the existing sheaf is left intact + * + * In practice we always refill to full sheaf's capacity. + */ +int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf **sheafp, unsigned int size) +{ + struct slab_sheaf *sheaf; + + /* + * TODO: do we want to support *sheaf == NULL to be equivalent of + * kmem_cache_prefill_sheaf() ? 
+ */
+ if (!sheafp || !(*sheafp))
+ return -EINVAL;
+
+ sheaf = *sheafp;
+ if (sheaf->size >= size)
+ return 0;
+
+ if (likely(sheaf->capacity >= size)) {
+ if (likely(sheaf->capacity == s->sheaf_capacity))
+ return refill_sheaf(s, sheaf, gfp);
+
+ if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size,
+ &sheaf->objects[sheaf->size])) {
+ return -ENOMEM;
+ }
+ sheaf->size = sheaf->capacity;
+
+ return 0;
+ }
+
+ /*
+ * We had a regular sized sheaf and need an oversize one, or we had an
+ * oversize one already but need a larger one now.
+ * This should be a very rare path so let's not complicate it.
+ */
+ sheaf = kmem_cache_prefill_sheaf(s, gfp, size);
+ if (!sheaf)
+ return -ENOMEM;
+
+ kmem_cache_return_sheaf(s, gfp, *sheafp);
+ *sheafp = sheaf;
+ return 0;
+}
+
+/*
+ * Allocate from a sheaf obtained by kmem_cache_prefill_sheaf()
+ *
+ * Guaranteed not to fail for at least as many allocations as the requested
+ * size. After the sheaf is emptied, it fails - no fallback to the slab cache
+ * itself.
+ *
+ * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT;
+ * memcg charging is forced over the limit if necessary, to avoid failure.
+ */
+void *
+kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp,
+ struct slab_sheaf *sheaf)
+{
+ void *ret = NULL;
+ bool init;
+
+ if (sheaf->size == 0)
+ goto out;
+
+ ret = sheaf->objects[--sheaf->size];
+
+ init = slab_want_init_on_alloc(gfp, s);
+
+ /* add __GFP_NOFAIL to force successful memcg charging */
+ slab_post_alloc_hook(s, NULL, gfp | __GFP_NOFAIL, 1, &ret, init, s->object_size);
+out:
+ trace_kmem_cache_alloc(_RET_IP_, ret, s, gfp, NUMA_NO_NODE);
+
+ return ret;
+}
+
+unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf)
+{
+ return sheaf->size;
+}
 /*
 * To avoid unnecessary overhead, we pass through large allocation requests
 * directly to the page allocator. We use __GFP_COMP, because we will need to
@@ -8578,6 +8831,11 @@ STAT_ATTR(BARN_GET, barn_get);
 STAT_ATTR(BARN_GET_FAIL, barn_get_fail);
 STAT_ATTR(BARN_PUT, barn_put);
 STAT_ATTR(BARN_PUT_FAIL, barn_put_fail);
+STAT_ATTR(SHEAF_PREFILL_FAST, sheaf_prefill_fast);
+STAT_ATTR(SHEAF_PREFILL_SLOW, sheaf_prefill_slow);
+STAT_ATTR(SHEAF_PREFILL_OVERSIZE, sheaf_prefill_oversize);
+STAT_ATTR(SHEAF_RETURN_FAST, sheaf_return_fast);
+STAT_ATTR(SHEAF_RETURN_SLOW, sheaf_return_slow);
 #endif /* CONFIG_SLUB_STATS */
 
 #ifdef CONFIG_KFENCE
@@ -8678,6 +8936,11 @@ static struct attribute *slab_attrs[] = {
 &barn_get_fail_attr.attr,
 &barn_put_attr.attr,
 &barn_put_fail_attr.attr,
+ &sheaf_prefill_fast_attr.attr,
+ &sheaf_prefill_slow_attr.attr,
+ &sheaf_prefill_oversize_attr.attr,
+ &sheaf_return_fast_attr.attr,
+ &sheaf_return_slow_attr.attr,
 #endif
 #ifdef CONFIG_FAILSLAB
 &failslab_attr.attr,
-- 
cgit v1.2.3


From 9b05890a25d9197e39fcf5b2298f0b911c323306 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett"
Date: Wed, 3 Sep 2025 15:00:01 +0200
Subject: maple_tree: Prefilled sheaf conversion and testing

Use prefilled sheaves instead of bulk allocations. This should speed up
the allocations and the return path of unused allocations.

Remove the push and pop of nodes from the maple state as this is now
handled by the slab layer with sheaves.

Testing has been removed where necessary since the features of the tree
have been reduced.

Signed-off-by: Liam R.
Howlett Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- include/linux/maple_tree.h | 6 +- lib/maple_tree.c | 326 ++++++--------------------- tools/testing/radix-tree/maple.c | 461 ++------------------------------------- tools/testing/shared/linux.c | 5 +- 4 files changed, 88 insertions(+), 710 deletions(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index bafe143b1f78..0e31b191e3be 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -442,7 +442,8 @@ struct ma_state { struct maple_enode *node; /* The node containing this entry */ unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ - struct maple_alloc *alloc; /* Allocated nodes for this operation */ + struct slab_sheaf *sheaf; /* Allocated nodes for this operation */ + unsigned long node_request; /* The number of nodes to allocate for this operation */ enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; @@ -490,7 +491,8 @@ struct ma_wr_state { .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ - .alloc = NULL, \ + .node_request = 0, \ + .sheaf = NULL, \ .mas_flags = 0, \ .store_type = wr_invalid, \ } diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 0439aaacf6cb..b41245f2cc65 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -182,6 +182,22 @@ static inline void mt_free_bulk(size_t size, void __rcu **nodes) kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); } +static void mt_return_sheaf(struct slab_sheaf *sheaf) +{ + kmem_cache_return_sheaf(maple_node_cache, GFP_NOWAIT, sheaf); +} + +static struct slab_sheaf *mt_get_sheaf(gfp_t gfp, int count) +{ + return kmem_cache_prefill_sheaf(maple_node_cache, gfp, count); +} + +static int mt_refill_sheaf(gfp_t gfp, struct slab_sheaf **sheaf, + unsigned int size) +{ + return kmem_cache_refill_sheaf(maple_node_cache, gfp, sheaf, size); +} + /* * ma_free_rcu() - Use rcu callback to free a maple node * @node: The node to free @@ -574,67 +590,6 @@ static __always_inline bool mte_dead_node(const struct maple_enode *enode) return ma_dead_node(node); } -/* - * mas_allocated() - Get the number of nodes allocated in a maple state. - * @mas: The maple state - * - * The ma_state alloc member is overloaded to hold a pointer to the first - * allocated node or to the number of requested nodes to allocate. If bit 0 is - * set, then the alloc contains the number of requested nodes. If there is an - * allocated node, then the total allocated nodes is in that node. - * - * Return: The total number of nodes allocated - */ -static inline unsigned long mas_allocated(const struct ma_state *mas) -{ - if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) - return 0; - - return mas->alloc->total; -} - -/* - * mas_set_alloc_req() - Set the requested number of allocations. - * @mas: the maple state - * @count: the number of allocations. - * - * The requested number of allocations is either in the first allocated node, - * located in @mas->alloc->request_count, or directly in @mas->alloc if there is - * no allocated node. Set the request either in the node or do the necessary - * encoding to store in @mas->alloc directly. 
- */ -static inline void mas_set_alloc_req(struct ma_state *mas, unsigned long count) -{ - if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) { - if (!count) - mas->alloc = NULL; - else - mas->alloc = (struct maple_alloc *)(((count) << 1U) | 1U); - return; - } - - mas->alloc->request_count = count; -} - -/* - * mas_alloc_req() - get the requested number of allocations. - * @mas: The maple state - * - * The alloc count is either stored directly in @mas, or in - * @mas->alloc->request_count if there is at least one node allocated. Decode - * the request count if it's stored directly in @mas->alloc. - * - * Return: The allocation request count. - */ -static inline unsigned int mas_alloc_req(const struct ma_state *mas) -{ - if ((unsigned long)mas->alloc & 0x1) - return (unsigned long)(mas->alloc) >> 1; - else if (mas->alloc) - return mas->alloc->request_count; - return 0; -} - /* * ma_pivots() - Get a pointer to the maple node pivots. * @node: the maple node @@ -1120,77 +1075,15 @@ static int mas_ascend(struct ma_state *mas) */ static inline struct maple_node *mas_pop_node(struct ma_state *mas) { - struct maple_alloc *ret, *node = mas->alloc; - unsigned long total = mas_allocated(mas); - unsigned int req = mas_alloc_req(mas); + struct maple_node *ret; - /* nothing or a request pending. */ - if (WARN_ON(!total)) + if (WARN_ON_ONCE(!mas->sheaf)) return NULL; - if (total == 1) { - /* single allocation in this ma_state */ - mas->alloc = NULL; - ret = node; - goto single_node; - } - - if (node->node_count == 1) { - /* Single allocation in this node. */ - mas->alloc = node->slot[0]; - mas->alloc->total = node->total - 1; - ret = node; - goto new_head; - } - node->total--; - ret = node->slot[--node->node_count]; - node->slot[node->node_count] = NULL; - -single_node: -new_head: - if (req) { - req++; - mas_set_alloc_req(mas, req); - } - + ret = kmem_cache_alloc_from_sheaf(maple_node_cache, GFP_NOWAIT, mas->sheaf); memset(ret, 0, sizeof(*ret)); - return (struct maple_node *)ret; -} - -/* - * mas_push_node() - Push a node back on the maple state allocation. - * @mas: The maple state - * @used: The used maple node - * - * Stores the maple node back into @mas->alloc for reuse. Updates allocated and - * requested node count as necessary. 
- */ -static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) -{ - struct maple_alloc *reuse = (struct maple_alloc *)used; - struct maple_alloc *head = mas->alloc; - unsigned long count; - unsigned int requested = mas_alloc_req(mas); - count = mas_allocated(mas); - - reuse->request_count = 0; - reuse->node_count = 0; - if (count) { - if (head->node_count < MAPLE_ALLOC_SLOTS) { - head->slot[head->node_count++] = reuse; - head->total++; - goto done; - } - reuse->slot[0] = head; - reuse->node_count = 1; - } - - reuse->total = count + 1; - mas->alloc = reuse; -done: - if (requested > 1) - mas_set_alloc_req(mas, requested - 1); + return ret; } /* @@ -1200,75 +1093,32 @@ done: */ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { - struct maple_alloc *node; - unsigned long allocated = mas_allocated(mas); - unsigned int requested = mas_alloc_req(mas); - unsigned int count; - void **slots = NULL; - unsigned int max_req = 0; - - if (!requested) - return; + if (unlikely(mas->sheaf)) { + unsigned long refill = mas->node_request; - mas_set_alloc_req(mas, 0); - if (mas->mas_flags & MA_STATE_PREALLOC) { - if (allocated) + if (kmem_cache_sheaf_size(mas->sheaf) >= refill) { + mas->node_request = 0; return; - WARN_ON(!allocated); - } - - if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { - node = (struct maple_alloc *)mt_alloc_one(gfp); - if (!node) - goto nomem_one; - - if (allocated) { - node->slot[0] = mas->alloc; - node->node_count = 1; - } else { - node->node_count = 0; } - mas->alloc = node; - node->total = ++allocated; - node->request_count = 0; - requested--; - } + if (mt_refill_sheaf(gfp, &mas->sheaf, refill)) + goto error; - node = mas->alloc; - while (requested) { - max_req = MAPLE_ALLOC_SLOTS - node->node_count; - slots = (void **)&node->slot[node->node_count]; - max_req = min(requested, max_req); - count = mt_alloc_bulk(gfp, max_req, slots); - if (!count) - goto nomem_bulk; - - if (node->node_count == 0) { - node->slot[0]->node_count = 0; - node->slot[0]->request_count = 0; - } + mas->node_request = 0; + return; + } - node->node_count += count; - allocated += count; - /* find a non-full node*/ - do { - node = node->slot[0]; - } while (unlikely(node->node_count == MAPLE_ALLOC_SLOTS)); - requested -= count; + mas->sheaf = mt_get_sheaf(gfp, mas->node_request); + if (likely(mas->sheaf)) { + mas->node_request = 0; + return; } - mas->alloc->total = allocated; - return; -nomem_bulk: - /* Clean up potential freed allocations on bulk failure */ - memset(slots, 0, max_req * sizeof(unsigned long)); - mas->alloc->total = allocated; -nomem_one: - mas_set_alloc_req(mas, requested); +error: mas_set_err(mas, -ENOMEM); } + /* * mas_free() - Free an encoded maple node * @mas: The maple state @@ -1279,42 +1129,7 @@ nomem_one: */ static inline void mas_free(struct ma_state *mas, struct maple_enode *used) { - struct maple_node *tmp = mte_to_node(used); - - if (mt_in_rcu(mas->tree)) - ma_free_rcu(tmp); - else - mas_push_node(mas, tmp); -} - -/* - * mas_node_count_gfp() - Check if enough nodes are allocated and request more - * if there is not enough nodes. 
- * @mas: The maple state - * @count: The number of nodes needed - * @gfp: the gfp flags - */ -static void mas_node_count_gfp(struct ma_state *mas, int count, gfp_t gfp) -{ - unsigned long allocated = mas_allocated(mas); - - if (allocated < count) { - mas_set_alloc_req(mas, count - allocated); - mas_alloc_nodes(mas, gfp); - } -} - -/* - * mas_node_count() - Check if enough nodes are allocated and request more if - * there is not enough nodes. - * @mas: The maple state - * @count: The number of nodes needed - * - * Note: Uses GFP_NOWAIT for gfp flags. - */ -static void mas_node_count(struct ma_state *mas, int count) -{ - return mas_node_count_gfp(mas, count, GFP_NOWAIT); + ma_free_rcu(mte_to_node(used)); } /* @@ -2451,10 +2266,7 @@ static inline void mas_topiary_node(struct ma_state *mas, enode = tmp_mas->node; tmp = mte_to_node(enode); mte_set_node_dead(enode); - if (in_rcu) - ma_free_rcu(tmp); - else - mas_push_node(mas, tmp); + ma_free_rcu(tmp); } /* @@ -3980,7 +3792,7 @@ set_content: * * Return: Number of nodes required for preallocation. */ -static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry) +static inline void mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry) { struct ma_state *mas = wr_mas->mas; unsigned char height = mas_mt_height(mas); @@ -4026,7 +3838,7 @@ static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry) WARN_ON_ONCE(1); } - return ret; + mas->node_request = ret; } /* @@ -4087,15 +3899,15 @@ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) */ static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) { - int request; + struct ma_state *mas = wr_mas->mas; mas_wr_prealloc_setup(wr_mas); - wr_mas->mas->store_type = mas_wr_store_type(wr_mas); - request = mas_prealloc_calc(wr_mas, entry); - if (!request) + mas->store_type = mas_wr_store_type(wr_mas); + mas_prealloc_calc(wr_mas, entry); + if (!mas->node_request) return; - mas_node_count(wr_mas->mas, request); + mas_alloc_nodes(mas, GFP_NOWAIT); } /** @@ -5208,7 +5020,6 @@ static inline void mte_destroy_walk(struct maple_enode *enode, */ void *mas_store(struct ma_state *mas, void *entry) { - int request; MA_WR_STATE(wr_mas, mas, entry); trace_ma_write(__func__, mas, 0, entry); @@ -5238,11 +5049,11 @@ void *mas_store(struct ma_state *mas, void *entry) return wr_mas.content; } - request = mas_prealloc_calc(&wr_mas, entry); - if (!request) + mas_prealloc_calc(&wr_mas, entry); + if (!mas->node_request) goto store; - mas_node_count(mas, request); + mas_alloc_nodes(mas, GFP_NOWAIT); if (mas_is_err(mas)) return NULL; @@ -5330,20 +5141,19 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { MA_WR_STATE(wr_mas, mas, entry); - int ret = 0; - int request; mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); - request = mas_prealloc_calc(&wr_mas, entry); - if (!request) + mas_prealloc_calc(&wr_mas, entry); + if (!mas->node_request) goto set_flag; mas->mas_flags &= ~MA_STATE_PREALLOC; - mas_node_count_gfp(mas, request, gfp); + mas_alloc_nodes(mas, gfp); if (mas_is_err(mas)) { - mas_set_alloc_req(mas, 0); - ret = xa_err(mas->node); + int ret = xa_err(mas->node); + + mas->node_request = 0; mas_destroy(mas); mas_reset(mas); return ret; @@ -5351,7 +5161,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) set_flag: mas->mas_flags |= MA_STATE_PREALLOC; - return ret; + return 0; } EXPORT_SYMBOL_GPL(mas_preallocate); @@ -5365,26 +5175,13 @@ 
EXPORT_SYMBOL_GPL(mas_preallocate); */ void mas_destroy(struct ma_state *mas) { - struct maple_alloc *node; - unsigned long total; - mas->mas_flags &= ~MA_STATE_PREALLOC; - total = mas_allocated(mas); - while (total) { - node = mas->alloc; - mas->alloc = node->slot[0]; - if (node->node_count > 1) { - size_t count = node->node_count - 1; - - mt_free_bulk(count, (void __rcu **)&node->slot[1]); - total -= count; - } - kfree(ma_mnode_ptr(node)); - total--; - } + mas->node_request = 0; + if (mas->sheaf) + mt_return_sheaf(mas->sheaf); - mas->alloc = NULL; + mas->sheaf = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); @@ -6019,7 +5816,7 @@ bool mas_nomem(struct ma_state *mas, gfp_t gfp) mas_alloc_nodes(mas, gfp); } - if (!mas_allocated(mas)) + if (!mas->sheaf) return false; mas->status = ma_start; @@ -7414,8 +7211,9 @@ void mas_dump(const struct ma_state *mas) pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, mas->index, mas->last); - pr_err(" min=%lx max=%lx alloc=" PTR_FMT ", depth=%u, flags=%x\n", - mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags); + pr_err(" min=%lx max=%lx sheaf=" PTR_FMT ", request %lu depth=%u, flags=%x\n", + mas->min, mas->max, mas->sheaf, mas->node_request, mas->depth, + mas->mas_flags); if (mas->index > mas->last) pr_err("Check index & last\n"); } diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 4a35e1e7c64b..72a8fe8e832a 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -57,430 +57,6 @@ struct rcu_reader_struct { struct rcu_test_struct2 *test; }; -static int get_alloc_node_count(struct ma_state *mas) -{ - int count = 1; - struct maple_alloc *node = mas->alloc; - - if (!node || ((unsigned long)node & 0x1)) - return 0; - while (node->node_count) { - count += node->node_count; - node = node->slot[0]; - } - return count; -} - -static void check_mas_alloc_node_count(struct ma_state *mas) -{ - mas_node_count_gfp(mas, MAPLE_ALLOC_SLOTS + 1, GFP_KERNEL); - mas_node_count_gfp(mas, MAPLE_ALLOC_SLOTS + 3, GFP_KERNEL); - MT_BUG_ON(mas->tree, get_alloc_node_count(mas) != mas->alloc->total); - mas_destroy(mas); -} - -/* - * check_new_node() - Check the creation of new nodes and error path - * verification. - */ -static noinline void __init check_new_node(struct maple_tree *mt) -{ - - struct maple_node *mn, *mn2, *mn3; - struct maple_alloc *smn; - struct maple_node *nodes[100]; - int i, j, total; - - MA_STATE(mas, mt, 0, 0); - - check_mas_alloc_node_count(&mas); - - /* Try allocating 3 nodes */ - mtree_lock(mt); - mt_set_non_kernel(0); - /* request 3 nodes to be allocated. */ - mas_node_count(&mas, 3); - /* Allocation request of 3. */ - MT_BUG_ON(mt, mas_alloc_req(&mas) != 3); - /* Allocate failed. */ - MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - - MT_BUG_ON(mt, mas_allocated(&mas) != 3); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, mas.alloc == NULL); - MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); - mas_push_node(&mas, mn); - mas_reset(&mas); - mas_destroy(&mas); - mtree_unlock(mt); - - - /* Try allocating 1 node, then 2 more */ - mtree_lock(mt); - /* Set allocation request to 1. */ - mas_set_alloc_req(&mas, 1); - /* Check Allocation request of 1. */ - MT_BUG_ON(mt, mas_alloc_req(&mas) != 1); - mas_set_err(&mas, -ENOMEM); - /* Validate allocation request. */ - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - /* Eat the requested node. 
*/ - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, mn->slot[0] != NULL); - MT_BUG_ON(mt, mn->slot[1] != NULL); - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - mas.status = ma_start; - mas_destroy(&mas); - /* Allocate 3 nodes, will fail. */ - mas_node_count(&mas, 3); - /* Drop the lock and allocate 3 nodes. */ - mas_nomem(&mas, GFP_KERNEL); - /* Ensure 3 are allocated. */ - MT_BUG_ON(mt, mas_allocated(&mas) != 3); - /* Allocation request of 0. */ - MT_BUG_ON(mt, mas_alloc_req(&mas) != 0); - - MT_BUG_ON(mt, mas.alloc == NULL); - MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); - MT_BUG_ON(mt, mas.alloc->slot[1] == NULL); - /* Ensure we counted 3. */ - MT_BUG_ON(mt, mas_allocated(&mas) != 3); - /* Free. */ - mas_reset(&mas); - mas_destroy(&mas); - - /* Set allocation request to 1. */ - mas_set_alloc_req(&mas, 1); - MT_BUG_ON(mt, mas_alloc_req(&mas) != 1); - mas_set_err(&mas, -ENOMEM); - /* Validate allocation request. */ - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - MT_BUG_ON(mt, mas_allocated(&mas) != 1); - /* Check the node is only one node. */ - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, mn->slot[0] != NULL); - MT_BUG_ON(mt, mn->slot[1] != NULL); - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas_allocated(&mas) != 1); - MT_BUG_ON(mt, mas.alloc->node_count); - - mas_set_alloc_req(&mas, 2); /* request 2 more. */ - MT_BUG_ON(mt, mas_alloc_req(&mas) != 2); - mas_set_err(&mas, -ENOMEM); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - MT_BUG_ON(mt, mas_allocated(&mas) != 3); - MT_BUG_ON(mt, mas.alloc == NULL); - MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); - MT_BUG_ON(mt, mas.alloc->slot[1] == NULL); - for (i = 2; i >= 0; i--) { - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, mas_allocated(&mas) != i); - MT_BUG_ON(mt, !mn); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - - total = 64; - mas_set_alloc_req(&mas, total); /* request 2 more. 
*/ - MT_BUG_ON(mt, mas_alloc_req(&mas) != total); - mas_set_err(&mas, -ENOMEM); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - for (i = total; i > 0; i--) { - unsigned int e = 0; /* expected node_count */ - - if (!MAPLE_32BIT) { - if (i >= 35) - e = i - 34; - else if (i >= 5) - e = i - 4; - else if (i >= 2) - e = i - 1; - } else { - if (i >= 4) - e = i - 3; - else if (i >= 1) - e = i - 1; - else - e = 0; - } - - MT_BUG_ON(mt, mas.alloc->node_count != e); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != i - 1); - MT_BUG_ON(mt, !mn); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - - total = 100; - for (i = 1; i < total; i++) { - mas_set_alloc_req(&mas, i); - mas_set_err(&mas, -ENOMEM); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - for (j = i; j > 0; j--) { - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, mas_allocated(&mas) != j - 1); - MT_BUG_ON(mt, !mn); - MT_BUG_ON(mt, not_empty(mn)); - mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas_allocated(&mas) != j); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != j - 1); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - - mas_set_alloc_req(&mas, i); - mas_set_err(&mas, -ENOMEM); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - for (j = 0; j <= i/2; j++) { - MT_BUG_ON(mt, mas_allocated(&mas) != i - j); - nodes[j] = mas_pop_node(&mas); - MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1); - } - - while (j) { - j--; - mas_push_node(&mas, nodes[j]); - MT_BUG_ON(mt, mas_allocated(&mas) != i - j); - } - MT_BUG_ON(mt, mas_allocated(&mas) != i); - for (j = 0; j <= i/2; j++) { - MT_BUG_ON(mt, mas_allocated(&mas) != i - j); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1); - } - mas_reset(&mas); - MT_BUG_ON(mt, mas_nomem(&mas, GFP_KERNEL)); - mas_destroy(&mas); - - } - - /* Set allocation request. */ - total = 500; - mas_node_count(&mas, total); - /* Drop the lock and allocate the nodes. */ - mas_nomem(&mas, GFP_KERNEL); - MT_BUG_ON(mt, !mas.alloc); - i = 1; - smn = mas.alloc; - while (i < total) { - for (j = 0; j < MAPLE_ALLOC_SLOTS; j++) { - i++; - MT_BUG_ON(mt, !smn->slot[j]); - if (i == total) - break; - } - smn = smn->slot[0]; /* next. */ - } - MT_BUG_ON(mt, mas_allocated(&mas) != total); - mas_reset(&mas); - mas_destroy(&mas); /* Free. */ - - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - for (i = 1; i < 128; i++) { - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != i); /* check request filled */ - for (j = i; j > 0; j--) { /*Free the requests */ - mn = mas_pop_node(&mas); /* get the next node. */ - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - } - - for (i = 1; i < MAPLE_NODE_MASK + 1; i++) { - MA_STATE(mas2, mt, 0, 0); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != i); /* check request filled */ - for (j = 1; j <= i; j++) { /* Move the allocations to mas2 */ - mn = mas_pop_node(&mas); /* get the next node. 
*/ - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, not_empty(mn)); - mas_push_node(&mas2, mn); - MT_BUG_ON(mt, mas_allocated(&mas2) != j); - } - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mas_allocated(&mas2) != i); - - for (j = i; j > 0; j--) { /*Free the requests */ - MT_BUG_ON(mt, mas_allocated(&mas2) != j); - mn = mas_pop_node(&mas2); /* get the next node. */ - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - MT_BUG_ON(mt, mas_allocated(&mas2) != 0); - } - - - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 1); /* Request */ - MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); - - mn = mas_pop_node(&mas); /* get the next node. */ - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); - - mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); - - /* Check the limit of pop/push/pop */ - mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 2); /* Request */ - MT_BUG_ON(mt, mas_alloc_req(&mas) != 1); - MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - MT_BUG_ON(mt, mas_alloc_req(&mas)); - MT_BUG_ON(mt, mas.alloc->node_count != 1); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); - mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas.alloc->node_count != 1); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - for (i = 1; i <= MAPLE_ALLOC_SLOTS + 1; i++) { - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - - - for (i = 3; i < MAPLE_NODE_MASK * 3; i++) { - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mn = mas_pop_node(&mas); /* get the next node. */ - mas_push_node(&mas, mn); /* put it back */ - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mn = mas_pop_node(&mas); /* get the next node. */ - mn2 = mas_pop_node(&mas); /* get the next node. */ - mas_push_node(&mas, mn); /* put them back */ - mas_push_node(&mas, mn2); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mn = mas_pop_node(&mas); /* get the next node. */ - mn2 = mas_pop_node(&mas); /* get the next node. */ - mn3 = mas_pop_node(&mas); /* get the next node. */ - mas_push_node(&mas, mn); /* put them back */ - mas_push_node(&mas, mn2); - mas_push_node(&mas, mn3); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mn = mas_pop_node(&mas); /* get the next node. 
*/ - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mn = mas_pop_node(&mas); /* get the next node. */ - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - mn = mas_pop_node(&mas); /* get the next node. */ - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - mn = mas_pop_node(&mas); /* get the next node. */ - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - mas_destroy(&mas); - } - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, 5); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != 5); - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, 10); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.status = ma_start; - MT_BUG_ON(mt, mas_allocated(&mas) != 10); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS - 1); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS - 1); - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, 10 + MAPLE_ALLOC_SLOTS - 1); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.status = ma_start; - MT_BUG_ON(mt, mas_allocated(&mas) != 10 + MAPLE_ALLOC_SLOTS - 1); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 1); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 2 + 2); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.status = ma_start; - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 2 + 2); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 2 + 1); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 2 + 1); - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 3 + 2); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.status = ma_start; - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 3 + 2); - mas_destroy(&mas); - - mtree_unlock(mt); -} - /* * Check erasing including RCU. 
*/ @@ -35507,6 +35083,13 @@ static unsigned char get_vacant_height(struct ma_wr_state *wr_mas, void *entry) return vacant_height; } +static int mas_allocated(struct ma_state *mas) +{ + if (mas->sheaf) + return kmem_cache_sheaf_size(mas->sheaf); + + return 0; +} /* Preallocation testing */ static noinline void __init check_prealloc(struct maple_tree *mt) { @@ -35525,7 +35108,10 @@ static noinline void __init check_prealloc(struct maple_tree *mt) /* Spanning store */ mas_set_range(&mas, 470, 500); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + + mas_wr_preallocate(&wr_mas, ptr); + MT_BUG_ON(mt, mas.store_type != wr_spanning_store); + MT_BUG_ON(mt, mas_is_err(&mas)); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); vacant_height = get_vacant_height(&wr_mas, ptr); @@ -35535,6 +35121,7 @@ static noinline void __init check_prealloc(struct maple_tree *mt) allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); + mas_wr_preallocate(&wr_mas, ptr); MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); @@ -35575,20 +35162,6 @@ static noinline void __init check_prealloc(struct maple_tree *mt) mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); - allocated = mas_allocated(&mas); - height = mas_mt_height(&mas); - vacant_height = get_vacant_height(&wr_mas, ptr); - MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); - mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas_allocated(&mas) != allocated); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); - mas_destroy(&mas); - allocated = mas_allocated(&mas); - MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); @@ -36389,11 +35962,17 @@ static void check_nomem_writer_race(struct maple_tree *mt) check_load(mt, 6, xa_mk_value(0xC)); mtree_unlock(mt); + mt_set_non_kernel(0); /* test for the same race but with mas_store_gfp() */ mtree_store_range(mt, 0, 5, xa_mk_value(0xA), GFP_KERNEL); mtree_store_range(mt, 6, 10, NULL, GFP_KERNEL); mas_set_range(&mas, 0, 5); + + /* setup writer 2 that will trigger the race condition */ + mt_set_private(mt); + mt_set_callback(writer2); + mtree_lock(mt); mas_store_gfp(&mas, NULL, GFP_KERNEL); @@ -36508,10 +36087,6 @@ void farmer_tests(void) check_erase_testset(&tree); mtree_destroy(&tree); - mt_init_flags(&tree, 0); - check_new_node(&tree); - mtree_destroy(&tree); - if (!MAPLE_32BIT) { mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_rcu_simulated(&tree); diff --git a/tools/testing/shared/linux.c b/tools/testing/shared/linux.c index 4ceff7969b78..8c7257155958 100644 --- a/tools/testing/shared/linux.c +++ b/tools/testing/shared/linux.c @@ -64,7 +64,8 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, if (!(gfp & __GFP_DIRECT_RECLAIM)) { if (!cachep->non_kernel) { - cachep->exec_callback = true; + if (cachep->callback) + cachep->exec_callback = true; return NULL; } @@ -210,6 +211,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, for (i = 0; i < size; i++) __kmem_cache_free_locked(cachep, p[i]); pthread_mutex_unlock(&cachep->lock); + if (cachep->callback) + cachep->exec_callback = true; return 0; } -- cgit v1.2.3 From 6bf377b06c08049d0f4042493df302285e45165e Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Wed, 3 Sep 2025 15:00:02 +0200 Subject: maple_tree: Add single node allocation support to maple state The fast path through a write will require replacing a single node in the tree. Using a sheaf (32 nodes) is too heavy for the fast path, so special case the node store operation by just allocating one node in the maple state. Signed-off-by: Liam R. Howlett Signed-off-by: Vlastimil Babka --- include/linux/maple_tree.h | 4 +++- lib/maple_tree.c | 47 +++++++++++++++++++++++++++++++++++----- tools/testing/radix-tree/maple.c | 9 ++++++-- 3 files changed, 51 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 0e31b191e3be..51a64ff23b88 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -443,6 +443,7 @@ struct ma_state { unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ struct slab_sheaf *sheaf; /* Allocated nodes for this operation */ + struct maple_node *alloc; /* A single allocated node for fast path writes */ unsigned long node_request; /* The number of nodes to allocate for this operation */ enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ @@ -491,8 +492,9 @@ struct ma_wr_state { .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ - .node_request = 0, \ .sheaf = NULL, \ + .alloc = NULL, \ + .node_request = 0, \ .mas_flags = 0, \ .store_type = wr_invalid, \ } diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b41245f2cc65..2b7299409900 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1073,16 +1073,23 @@ static int mas_ascend(struct ma_state *mas) * * Return: A pointer to a maple node. 
*/ -static inline struct maple_node *mas_pop_node(struct ma_state *mas) +static __always_inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_node *ret; + if (mas->alloc) { + ret = mas->alloc; + mas->alloc = NULL; + goto out; + } + if (WARN_ON_ONCE(!mas->sheaf)) return NULL; ret = kmem_cache_alloc_from_sheaf(maple_node_cache, GFP_NOWAIT, mas->sheaf); - memset(ret, 0, sizeof(*ret)); +out: + memset(ret, 0, sizeof(*ret)); return ret; } @@ -1093,9 +1100,34 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) */ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { - if (unlikely(mas->sheaf)) { - unsigned long refill = mas->node_request; + if (!mas->node_request) + return; + + if (mas->node_request == 1) { + if (mas->sheaf) + goto use_sheaf; + + if (mas->alloc) + return; + mas->alloc = mt_alloc_one(gfp); + if (!mas->alloc) + goto error; + + mas->node_request = 0; + return; + } + +use_sheaf: + if (unlikely(mas->alloc)) { + kfree(mas->alloc); + mas->alloc = NULL; + } + + if (mas->sheaf) { + unsigned long refill; + + refill = mas->node_request; if (kmem_cache_sheaf_size(mas->sheaf) >= refill) { mas->node_request = 0; return; @@ -5180,8 +5212,11 @@ void mas_destroy(struct ma_state *mas) mas->node_request = 0; if (mas->sheaf) mt_return_sheaf(mas->sheaf); - mas->sheaf = NULL; + + if (mas->alloc) + kfree(mas->alloc); + mas->alloc = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); @@ -5816,7 +5851,7 @@ bool mas_nomem(struct ma_state *mas, gfp_t gfp) mas_alloc_nodes(mas, gfp); } - if (!mas->sheaf) + if (!mas->sheaf && !mas->alloc) return false; mas->status = ma_start; diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 72a8fe8e832a..83260f2efb19 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35085,10 +35085,15 @@ static unsigned char get_vacant_height(struct ma_wr_state *wr_mas, void *entry) static int mas_allocated(struct ma_state *mas) { + int total = 0; + + if (mas->alloc) + total++; + if (mas->sheaf) - return kmem_cache_sheaf_size(mas->sheaf); + total += kmem_cache_sheaf_size(mas->sheaf); - return 0; + return total; } /* Preallocation testing */ static noinline void __init check_prealloc(struct maple_tree *mt) -- cgit v1.2.3 From 4957089a23f41f31f8e7e22802a8ef9f5789c191 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 8 Sep 2025 18:00:02 -0700 Subject: locking/local_lock: Introduce local_lock_is_locked(). Introduce local_lock_is_locked() that returns true when given local_lock is locked by current cpu (in !PREEMPT_RT) or by current task (in PREEMPT_RT). The goal is to detect a deadlock by the caller. 
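For illustration only, a minimal sketch of the intended caller-side
pattern follows. The percpu lock "my_lock" and the surrounding function
are hypothetical and not part of this patch, and, per the implementation
notes, preemption (or on PREEMPT_RT, migration) must already be disabled
when calling local_lock_is_locked():

	static DEFINE_PER_CPU(local_trylock_t, my_lock) =
		INIT_LOCAL_TRYLOCK(my_lock);

	/* runs with preemption (or, on PREEMPT_RT, migration) disabled */
	if (local_lock_is_locked(&my_lock)) {
		/* already held by this cpu/task: recursing would deadlock */
		return -EBUSY;
	}
	local_lock(&my_lock);
	/* ... safely manipulate the per-CPU state ... */
	local_unlock(&my_lock);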
Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Alexei Starovoitov Signed-off-by: Vlastimil Babka --- include/linux/local_lock.h | 2 ++ include/linux/local_lock_internal.h | 7 +++++++ include/linux/rtmutex.h | 10 ++++++++++ kernel/locking/rtmutex_common.h | 9 --------- 4 files changed, 19 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h index 2ba846419524..0d91d060e3e9 100644 --- a/include/linux/local_lock.h +++ b/include/linux/local_lock.h @@ -66,6 +66,8 @@ */ #define local_trylock(lock) __local_trylock(this_cpu_ptr(lock)) +#define local_lock_is_locked(lock) __local_lock_is_locked(lock) + /** * local_trylock_irqsave - Try to acquire a per CPU local lock, save and disable * interrupts if acquired diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index 949de37700db..a4dc479157b5 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -165,6 +165,9 @@ do { \ !!tl; \ }) +/* preemption or migration must be disabled before calling __local_lock_is_locked */ +#define __local_lock_is_locked(lock) READ_ONCE(this_cpu_ptr(lock)->acquired) + #define __local_lock_release(lock) \ do { \ local_trylock_t *tl; \ @@ -285,4 +288,8 @@ do { \ __local_trylock(lock); \ }) +/* migration must be disabled before calling __local_lock_is_locked */ +#define __local_lock_is_locked(__lock) \ + (rt_mutex_owner(&this_cpu_ptr(__lock)->lock) == current) + #endif /* CONFIG_PREEMPT_RT */ diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index fa9f1021541e..ede4c6bf6f22 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -44,6 +44,16 @@ static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock) return READ_ONCE(lock->owner) != NULL; } +#ifdef CONFIG_RT_MUTEXES +#define RT_MUTEX_HAS_WAITERS 1UL + +static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) +{ + unsigned long owner = (unsigned long) READ_ONCE(lock->owner); + + return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); +} +#endif extern void rt_mutex_base_init(struct rt_mutex_base *rtb); /** diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 78dd3d8c6554..cf6ddd1b23a2 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -153,15 +153,6 @@ static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) pi_tree.entry); } -#define RT_MUTEX_HAS_WAITERS 1UL - -static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) -{ - unsigned long owner = (unsigned long) READ_ONCE(lock->owner); - - return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); -} - /* * Constants for rt mutex functions which have a selectable deadlock * detection. -- cgit v1.2.3 From 99253de51f80acccc528a9c94e2f4d5f329071f1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 8 Sep 2025 18:00:03 -0700 Subject: mm: Allow GFP_ACCOUNT to be used in alloc_pages_nolock(). Change alloc_pages_nolock() to default to __GFP_COMP when allocating pages, since upcoming reentrant alloc_slab_page() needs __GFP_COMP. Also allow __GFP_ACCOUNT flag to be specified, since most of BPF infra needs __GFP_ACCOUNT except BPF streams. 
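To illustrate, the call sites updated below in this patch show both
modes of the new first argument, which selects whether the page is
charged to the current memcg:

	/* BPF streams: pages are intentionally not accounted */
	page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0);

	/* most of BPF infra: charge the allocation to the memcg */
	page = alloc_pages_nolock(__GFP_ACCOUNT, nid, 0);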
Reviewed-by: Vlastimil Babka
Signed-off-by: Alexei Starovoitov
Reviewed-by: Shakeel Butt
Reviewed-by: Harry Yoo
Signed-off-by: Vlastimil Babka
---
 include/linux/gfp.h | 2 +-
 kernel/bpf/stream.c | 2 +-
 kernel/bpf/syscall.c | 2 +-
 mm/page_alloc.c | 10 ++++++----
 4 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 5ebf26fcdcfa..0ceb4e09306c 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -354,7 +354,7 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
 }
 #define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))
 
-struct page *alloc_pages_nolock_noprof(int nid, unsigned int order);
+struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order);
 #define alloc_pages_nolock(...) alloc_hooks(alloc_pages_nolock_noprof(__VA_ARGS__))
 
 extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c
index ab592db4a4bf..eb6c5a21c2ef 100644
--- a/kernel/bpf/stream.c
+++ b/kernel/bpf/stream.c
@@ -83,7 +83,7 @@ static struct bpf_stream_page *bpf_stream_page_replace(void)
 struct bpf_stream_page *stream_page, *old_stream_page;
 struct page *page;
 
- page = alloc_pages_nolock(NUMA_NO_NODE, 0);
+ page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0);
 if (!page)
 return NULL;
 stream_page = page_address(page);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0fbfa8532c39..dbf86f8014de 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -581,7 +581,7 @@ static bool can_alloc_pages(void)
 static struct page *__bpf_alloc_page(int nid)
 {
 if (!can_alloc_pages())
- return alloc_pages_nolock(nid, 0);
+ return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0);
 
 return alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d1d037f97c5f..30ccff0283fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7480,6 +7480,7 @@ static bool __free_unaccepted(struct page *page)
 
 /**
 * alloc_pages_nolock - opportunistic reentrant allocation from any context
+ * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed.
 * @nid: node to allocate from
 * @order: allocation order size
 *
@@ -7493,7 +7494,7 @@ static bool __free_unaccepted(struct page *page)
 * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN.
 * It means ENOMEM. There is no reason to call it again and expect !NULL.
 */
-struct page *alloc_pages_nolock_noprof(int nid, unsigned int order)
+struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order)
 {
 /*
 * Do not specify __GFP_DIRECT_RECLAIM, since direct reclaim is not allowed.
@@ -7515,12 +7516,13 @@ struct page *alloc_pages_nolock_noprof(int nid, unsigned int order)
 * specify it here to highlight that alloc_pages_nolock()
 * doesn't want to deplete reserves.
 */
- gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC
- | __GFP_ACCOUNT;
+ gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC | __GFP_COMP
+ | gfp_flags;
 unsigned int alloc_flags = ALLOC_TRYLOCK;
 struct alloc_context ac = { };
 struct page *page;
 
+ VM_WARN_ON_ONCE(gfp_flags & ~__GFP_ACCOUNT);
 /*
 * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is
 * unsafe in NMI.
If spin_trylock() is called from hard IRQ the current
@@ -7558,7 +7560,7 @@ struct page *alloc_pages_nolock_noprof(int nid, unsigned int order)
 if (page)
 set_page_refcounted(page);
 
- if (memcg_kmem_online() && page &&
+ if (memcg_kmem_online() && page && (gfp_flags & __GFP_ACCOUNT) &&
 unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) {
 free_pages_nolock(page, order);
 page = NULL;
-- 
cgit v1.2.3


From 7612833192d56af86061de8ab51989b75daf5b0d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Mon, 8 Sep 2025 18:00:06 -0700
Subject: slab: Reuse first bit for OBJEXTS_ALLOC_FAIL

Since the combination of valid upper bits in slab->obj_exts with the
OBJEXTS_ALLOC_FAIL bit can never happen, use OBJEXTS_ALLOC_FAIL == (1ull << 0)
as a magic sentinel instead of (1ull << 2) to free up bit 2.

Signed-off-by: Alexei Starovoitov
Acked-by: Shakeel Butt
Reviewed-by: Harry Yoo
Signed-off-by: Vlastimil Babka
---
 include/linux/memcontrol.h | 10 ++++++++--
 mm/slub.c | 2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 785173aa0739..d254c0b96d0d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -341,17 +341,23 @@ enum page_memcg_data_flags {
 __NR_MEMCG_DATA_FLAGS = (1UL << 2),
 };
 
+#define __OBJEXTS_ALLOC_FAIL MEMCG_DATA_OBJEXTS
 #define __FIRST_OBJEXT_FLAG __NR_MEMCG_DATA_FLAGS
 
 #else /* CONFIG_MEMCG */
 
+#define __OBJEXTS_ALLOC_FAIL (1UL << 0)
 #define __FIRST_OBJEXT_FLAG (1UL << 0)
 
 #endif /* CONFIG_MEMCG */
 
 enum objext_flags {
- /* slabobj_ext vector failed to allocate */
- OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG,
+ /*
+ * Use bit 0 with zero other bits to signal that slabobj_ext vector
+ * failed to allocate. The same bit 0 with valid upper bits means
+ * MEMCG_DATA_OBJEXTS.
+ */
+ OBJEXTS_ALLOC_FAIL = __OBJEXTS_ALLOC_FAIL,
 /* the next bit after the last actual flag */
 __NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1),
 };
diff --git a/mm/slub.c b/mm/slub.c
index ee575ed9250f..189cd5aa4ac4 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2046,7 +2046,7 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
 * objects with no tag reference. Mark all references in this
 * vector as empty to avoid warnings later on.
 */
- if (obj_exts & OBJEXTS_ALLOC_FAIL) {
+ if (obj_exts == OBJEXTS_ALLOC_FAIL) {
 unsigned int i;
 
 for (i = 0; i < objects; i++)
-- 
cgit v1.2.3


From af92793e52c3a99b828ed4bdd277fd3e11c18d08 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Mon, 8 Sep 2025 18:00:07 -0700
Subject: slab: Introduce kmalloc_nolock() and kfree_nolock().

kmalloc_nolock() relies on the ability of local_trylock_t to detect
the situation when a per-cpu kmem_cache is locked.

In !PREEMPT_RT local_(try)lock_irqsave(&s->cpu_slab->lock, flags)
disables IRQs and marks s->cpu_slab->lock as acquired.
local_lock_is_locked(&s->cpu_slab->lock) returns true when the slab
allocator is in the middle of manipulating the per-cpu cache of that
specific kmem_cache.

kmalloc_nolock() can be called from any context and can re-enter into
___slab_alloc():

kmalloc() -> ___slab_alloc(cache_A) -> irqsave -> NMI -> bpf ->
kmalloc_nolock() -> ___slab_alloc(cache_B)
or
kmalloc() -> ___slab_alloc(cache_A) -> irqsave -> tracepoint/kprobe -> bpf ->
kmalloc_nolock() -> ___slab_alloc(cache_B)

Hence the caller of ___slab_alloc() checks if &s->cpu_slab->lock
can be acquired without a deadlock before invoking the function. If that
specific per-cpu kmem_cache is busy, kmalloc_nolock() retries in a
different kmalloc bucket.
The second attempt will likely succeed, since this CPU has locked a
different kmem_cache.

Similarly, in PREEMPT_RT local_lock_is_locked() returns true when the
per-cpu rt_spin_lock is locked by the current _task_. In this case
re-entrance into the same kmalloc bucket is unsafe, and kmalloc_nolock()
tries a different bucket that is most likely not locked by the current
task. Though it may be locked by a different task, it's safe to
rt_spin_lock() and sleep on it.

Similar to alloc_pages_nolock(), kmalloc_nolock() returns NULL
immediately if called from hard irq or NMI in PREEMPT_RT.

kfree_nolock() defers freeing to irq_work when local_lock_is_locked()
and (in_nmi() or in PREEMPT_RT).

SLUB_TINY config doesn't use local_lock_is_locked() and relies on
spin_trylock_irqsave(&n->list_lock) to allocate, while kfree_nolock()
always defers to irq_work.

Note, kfree_nolock() must be called _only_ for objects allocated with
kmalloc_nolock(). Debug checks (like kmemleak and kfence) were skipped
on allocation, hence

obj = kmalloc(); kfree_nolock(obj);

will miss kmemleak/kfence bookkeeping and will cause false positives.
large_kmalloc is not supported by either kmalloc_nolock() or
kfree_nolock().

Signed-off-by: Alexei Starovoitov
Reviewed-by: Harry Yoo
Signed-off-by: Vlastimil Babka
---
 include/linux/kasan.h | 13 +-
 include/linux/memcontrol.h | 2 +
 include/linux/slab.h | 4 +
 mm/Kconfig | 1 +
 mm/kasan/common.c | 5 +-
 mm/slab.h | 6 +
 mm/slab_common.c | 3 +
 mm/slub.c | 504 ++++++++++++++++++++++++++++++++++++++++-----
 8 files changed, 483 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 890011071f2b..acdc8cb0152e 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -200,7 +200,7 @@ static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s,
 }
 
 bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init,
- bool still_accessible);
+ bool still_accessible, bool no_quarantine);
 
 /**
 * kasan_slab_free - Poison, initialize, and quarantine a slab object.
 * @object: Object to be freed.
@@ -226,11 +226,13 @@ bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init,
 * @Return true if KASAN took ownership of the object; false otherwise.
 */
 static __always_inline bool kasan_slab_free(struct kmem_cache *s,
- void *object, bool init,
- bool still_accessible)
+ void *object, bool init,
+ bool still_accessible,
+ bool no_quarantine)
 {
 if (kasan_enabled())
- return __kasan_slab_free(s, object, init, still_accessible);
+ return __kasan_slab_free(s, object, init, still_accessible,
+ no_quarantine);
 
 return false;
 }
@@ -427,7 +429,8 @@ static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object)
 }
 
 static inline bool kasan_slab_free(struct kmem_cache *s, void *object,
- bool init, bool still_accessible)
+ bool init, bool still_accessible,
+ bool no_quarantine)
 {
 return false;
 }
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d254c0b96d0d..82563236f35c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -358,6 +358,8 @@ enum objext_flags {
 * MEMCG_DATA_OBJEXTS.
*/ OBJEXTS_ALLOC_FAIL = __OBJEXTS_ALLOC_FAIL, + /* slabobj_ext vector allocated with kmalloc_nolock() */ + OBJEXTS_NOSPIN_ALLOC = __FIRST_OBJEXT_FLAG, /* the next bit after the last actual flag */ __NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1), }; diff --git a/include/linux/slab.h b/include/linux/slab.h index 680193356ac7..561597dd2164 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -501,6 +501,7 @@ void * __must_check krealloc_noprof(const void *objp, size_t new_size, #define krealloc(...) alloc_hooks(krealloc_noprof(__VA_ARGS__)) void kfree(const void *objp); +void kfree_nolock(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); @@ -957,6 +958,9 @@ static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t f } #define kmalloc(...) alloc_hooks(kmalloc_noprof(__VA_ARGS__)) +void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node); +#define kmalloc_nolock(...) alloc_hooks(kmalloc_nolock_noprof(__VA_ARGS__)) + #define kmem_buckets_alloc(_b, _size, _flags) \ alloc_hooks(__kmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE)) diff --git a/mm/Kconfig b/mm/Kconfig index e443fe8cd6cf..202e044f2b4d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -194,6 +194,7 @@ menu "Slab allocator options" config SLUB def_bool y + select IRQ_WORK config KVFREE_RCU_BATCHED def_bool y diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 9142964ab9c9..3264900b942f 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -252,7 +252,7 @@ bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object, } bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, - bool still_accessible) + bool still_accessible, bool no_quarantine) { if (!kasan_arch_is_ready() || is_kfence_address(object)) return false; @@ -274,6 +274,9 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, poison_slab_object(cache, object, init); + if (no_quarantine) + return false; + /* * If the object is put into quarantine, do not let slab put the object * onto the freelist for now. 
The object's metadata is kept until the diff --git a/mm/slab.h b/mm/slab.h index 43245d9207b6..35e533e59b07 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -57,6 +57,10 @@ struct slab { struct { union { struct list_head slab_list; + struct { /* For deferred deactivate_slab() */ + struct llist_node llnode; + void *flush_freelist; + }; #ifdef CONFIG_SLUB_CPU_PARTIAL struct { struct slab *next; @@ -662,6 +666,8 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) void __check_heap_object(const void *ptr, unsigned long n, const struct slab *slab, bool to_user); +void defer_free_barrier(void); + static inline bool slub_debug_orig_size(struct kmem_cache *s) { return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && diff --git a/mm/slab_common.c b/mm/slab_common.c index b6601e0fe598..932d13ada36c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -510,6 +510,9 @@ void kmem_cache_destroy(struct kmem_cache *s) rcu_barrier(); } + /* Wait for deferred work from kmalloc/kfree_nolock() */ + defer_free_barrier(); + cpus_read_lock(); mutex_lock(&slab_mutex); diff --git a/mm/slub.c b/mm/slub.c index 189cd5aa4ac4..f9f7f3942074 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -44,7 +44,8 @@ #include #include #include - +#include +#include #include #include @@ -426,7 +427,7 @@ struct kmem_cache_cpu { #ifdef CONFIG_SLUB_CPU_PARTIAL struct slab *partial; /* Partially allocated slabs */ #endif - local_lock_t lock; /* Protects the fields above */ + local_trylock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS unsigned int stat[NR_SLUB_STAT_ITEMS]; #endif @@ -2079,6 +2080,7 @@ static inline void init_slab_obj_exts(struct slab *slab) int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab) { + bool allow_spin = gfpflags_allow_spinning(gfp); unsigned int objects = objs_per_slab(s, slab); unsigned long new_exts; unsigned long old_exts; @@ -2087,8 +2089,22 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp &= ~OBJCGS_CLEAR_MASK; /* Prevent recursive extension vector allocation */ gfp |= __GFP_NO_OBJ_EXT; - vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, - slab_nid(slab)); + + /* + * Note that allow_spin may be false during early boot and its + * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting + * architectures with cmpxchg16b, early obj_exts will be missing for + * very early allocations on those. + */ + if (unlikely(!allow_spin)) { + size_t sz = objects * sizeof(struct slabobj_ext); + + vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT, + slab_nid(slab)); + } else { + vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, + slab_nid(slab)); + } if (!vec) { /* Mark vectors which failed to allocate */ if (new_slab) @@ -2098,6 +2114,8 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, } new_exts = (unsigned long)vec; + if (unlikely(!allow_spin)) + new_exts |= OBJEXTS_NOSPIN_ALLOC; #ifdef CONFIG_MEMCG new_exts |= MEMCG_DATA_OBJEXTS; #endif @@ -2118,7 +2136,10 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, * objcg vector should be reused. */ mark_objexts_empty(vec); - kfree(vec); + if (unlikely(!allow_spin)) + kfree_nolock(vec); + else + kfree(vec); return 0; } @@ -2142,7 +2163,10 @@ static inline void free_slab_obj_exts(struct slab *slab) * the extension for obj_exts is expected to be NULL. 
*/ mark_objexts_empty(obj_exts); - kfree(obj_exts); + if (unlikely(READ_ONCE(slab->obj_exts) & OBJEXTS_NOSPIN_ALLOC)) + kfree_nolock(obj_exts); + else + kfree(obj_exts); slab->obj_exts = 0; } @@ -2476,7 +2500,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init, } /* KASAN might put x into memory quarantine, delaying its reuse. */ - return !kasan_slab_free(s, x, init, still_accessible); + return !kasan_slab_free(s, x, init, still_accessible, false); } static __fastpath_inline @@ -2981,13 +3005,17 @@ static void barn_shrink(struct kmem_cache *s, struct node_barn *barn) * Slab allocation and freeing */ static inline struct slab *alloc_slab_page(gfp_t flags, int node, - struct kmem_cache_order_objects oo) + struct kmem_cache_order_objects oo, + bool allow_spin) { struct folio *folio; struct slab *slab; unsigned int order = oo_order(oo); - if (node == NUMA_NO_NODE) + if (unlikely(!allow_spin)) + folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, + node, order); + else if (node == NUMA_NO_NODE) folio = (struct folio *)alloc_frozen_pages(flags, order); else folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL); @@ -3137,6 +3165,7 @@ static __always_inline void unaccount_slab(struct slab *slab, int order, static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { + bool allow_spin = gfpflags_allow_spinning(flags); struct slab *slab; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; @@ -3156,7 +3185,11 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM; - slab = alloc_slab_page(alloc_gfp, node, oo); + /* + * __GFP_RECLAIM could be cleared on the first allocation attempt, + * so pass allow_spin flag directly. + */ + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); if (unlikely(!slab)) { oo = s->min; alloc_gfp = flags; @@ -3164,7 +3197,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Allocation may have failed due to fragmentation. * Try a lower order alloc if possible */ - slab = alloc_slab_page(alloc_gfp, node, oo); + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); if (unlikely(!slab)) return NULL; stat(s, ORDER_FALLBACK); @@ -3333,33 +3366,47 @@ static void *alloc_single_from_partial(struct kmem_cache *s, return object; } +static void defer_deactivate_slab(struct slab *slab, void *flush_freelist); + /* * Called only for kmem_cache_debug() caches to allocate from a freshly * allocated slab. Allocate a single object instead of whole freelist * and put the slab to the partial (or full) list. 
*/ -static void *alloc_single_from_new_slab(struct kmem_cache *s, - struct slab *slab, int orig_size) +static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, + int orig_size, gfp_t gfpflags) { + bool allow_spin = gfpflags_allow_spinning(gfpflags); int nid = slab_nid(slab); struct kmem_cache_node *n = get_node(s, nid); unsigned long flags; void *object; + if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { + /* Unlucky, discard newly allocated slab */ + slab->frozen = 1; + defer_deactivate_slab(slab, NULL); + return NULL; + } object = slab->freelist; slab->freelist = get_freepointer(s, object); slab->inuse = 1; - if (!alloc_debug_processing(s, slab, object, orig_size)) + if (!alloc_debug_processing(s, slab, object, orig_size)) { /* * It's not really expected that this would fail on a * freshly allocated slab, but a concurrent memory * corruption in theory could cause that. + * Leak memory of allocated slab. */ + if (!allow_spin) + spin_unlock_irqrestore(&n->list_lock, flags); return NULL; + } - spin_lock_irqsave(&n->list_lock, flags); + if (allow_spin) + spin_lock_irqsave(&n->list_lock, flags); if (slab->inuse == slab->objects) add_full(s, n, slab); @@ -3400,7 +3447,10 @@ static struct slab *get_partial_node(struct kmem_cache *s, if (!n || !n->nr_partial) return NULL; - spin_lock_irqsave(&n->list_lock, flags); + if (gfpflags_allow_spinning(pc->flags)) + spin_lock_irqsave(&n->list_lock, flags); + else if (!spin_trylock_irqsave(&n->list_lock, flags)) + return NULL; list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { if (!pfmemalloc_match(slab, pc->flags)) continue; @@ -3606,7 +3656,7 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) lockdep_register_key(&s->lock_key); for_each_possible_cpu(cpu) { c = per_cpu_ptr(s->cpu_slab, cpu); - local_lock_init(&c->lock); + local_trylock_init(&c->lock); if (finegrain_lockdep) lockdep_set_class(&c->lock, &s->lock_key); c->tid = init_tid(cpu); @@ -3699,6 +3749,47 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, } } +/* + * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock + * can be acquired without a deadlock before invoking the function. + * + * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is + * using local_lock_is_locked() properly before calling local_lock_cpu_slab(), + * and kmalloc() is not used in an unsupported context. + * + * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave(). + * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but + * lockdep_assert() will catch a bug in case: + * #1 + * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock() + * or + * #2 + * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() + * + * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt + * disabled context. The lock will always be acquired and if needed it + * block and sleep until the lock is available. + * #1 is possible in !PREEMPT_RT only. 
+ * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock: + * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) -> + * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B) + * + * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B + */ +#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP) +#define local_lock_cpu_slab(s, flags) \ + local_lock_irqsave(&(s)->cpu_slab->lock, flags) +#else +#define local_lock_cpu_slab(s, flags) \ + do { \ + bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \ + lockdep_assert(__l); \ + } while (0) +#endif + +#define local_unlock_cpu_slab(s, flags) \ + local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) + #ifdef CONFIG_SLUB_CPU_PARTIAL static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) { @@ -3783,7 +3874,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) unsigned long flags; int slabs = 0; - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); oldslab = this_cpu_read(s->cpu_slab->partial); @@ -3808,7 +3899,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) this_cpu_write(s->cpu_slab->partial, slab); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); if (slab_to_put) { __put_partials(s, slab_to_put); @@ -4323,6 +4414,7 @@ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { + bool allow_spin = gfpflags_allow_spinning(gfpflags); void *freelist; struct slab *slab; unsigned long flags; @@ -4348,9 +4440,21 @@ reread_slab: if (unlikely(!node_match(slab, node))) { /* * same as above but node_match() being false already - * implies node != NUMA_NO_NODE + * implies node != NUMA_NO_NODE. + * + * We don't strictly honor pfmemalloc and NUMA preferences + * when !allow_spin because: + * + * 1. Most kmalloc() users allocate objects on the local node, + * so kmalloc_nolock() tries not to interfere with them by + * deactivating the cpu slab. + * + * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause + * unnecessary slab allocations even when n->partial list + * is not empty. 
*/ - if (!node_isset(node, slab_nodes)) { + if (!node_isset(node, slab_nodes) || + !allow_spin) { node = NUMA_NO_NODE; } else { stat(s, ALLOC_NODE_MISMATCH); @@ -4363,13 +4467,14 @@ reread_slab: * PFMEMALLOC but right now, we are losing the pfmemalloc * information when the page leaves the per-cpu allocator */ - if (unlikely(!pfmemalloc_match(slab, gfpflags))) + if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) goto deactivate_slab; /* must check again c->slab in case we got preempted and it changed */ - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); + if (unlikely(slab != c->slab)) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } freelist = c->freelist; @@ -4381,7 +4486,7 @@ reread_slab: if (!freelist) { c->slab = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); stat(s, DEACTIVATE_BYPASS); goto new_slab; } @@ -4400,34 +4505,34 @@ load_freelist: VM_BUG_ON(!c->slab->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); return freelist; deactivate_slab: - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (slab != c->slab) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } freelist = c->freelist; c->slab = NULL; c->freelist = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); deactivate_slab(s, slab, freelist); new_slab: #ifdef CONFIG_SLUB_CPU_PARTIAL while (slub_percpu_partial(c)) { - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (unlikely(c->slab)) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } if (unlikely(!slub_percpu_partial(c))) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); /* we were preempted and partial list got empty */ goto new_objects; } @@ -4436,7 +4541,8 @@ new_slab: slub_set_percpu_partial(c, slab); if (likely(node_match(slab, node) && - pfmemalloc_match(slab, gfpflags))) { + pfmemalloc_match(slab, gfpflags)) || + !allow_spin) { c->slab = slab; freelist = get_freelist(s, slab); VM_BUG_ON(!freelist); @@ -4444,7 +4550,7 @@ new_slab: goto load_freelist; } - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); slab->next = NULL; __put_partials(s, slab); @@ -4466,8 +4572,13 @@ new_objects: * allocating new page from other nodes */ if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) - && try_thisnode)) - pc.flags = GFP_NOWAIT | __GFP_THISNODE; + && try_thisnode)) { + if (unlikely(!allow_spin)) + /* Do not upgrade gfp to NOWAIT from more restrictive mode */ + pc.flags = gfpflags | __GFP_THISNODE; + else + pc.flags = GFP_NOWAIT | __GFP_THISNODE; + } pc.orig_size = orig_size; slab = get_partial(s, node, &pc); @@ -4506,7 +4617,7 @@ new_objects: stat(s, ALLOC_SLAB); if (kmem_cache_debug(s)) { - freelist = alloc_single_from_new_slab(s, slab, orig_size); + freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); if (unlikely(!freelist)) goto new_objects; @@ -4528,7 +4639,7 @@ new_objects: inc_slabs_node(s, slab_nid(slab), slab->objects); - if (unlikely(!pfmemalloc_match(slab, gfpflags))) { + if (unlikely(!pfmemalloc_match(slab, gfpflags) 
&& allow_spin)) { /* * For !pfmemalloc_match() case we don't load freelist so that * we don't make further mismatched allocations easier. @@ -4539,7 +4650,7 @@ new_objects: retry_load_slab: - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (unlikely(c->slab)) { void *flush_freelist = c->freelist; struct slab *flush_slab = c->slab; @@ -4548,9 +4659,14 @@ retry_load_slab: c->freelist = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); - deactivate_slab(s, flush_slab, flush_freelist); + if (unlikely(!allow_spin)) { + /* Reentrant slub cannot take locks, defer */ + defer_deactivate_slab(flush_slab, flush_freelist); + } else { + deactivate_slab(s, flush_slab, flush_freelist); + } stat(s, CPUSLAB_FLUSH); @@ -4560,6 +4676,19 @@ retry_load_slab: goto load_freelist; } +/* + * We disallow kprobes in ___slab_alloc() to prevent reentrance + * + * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of + * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf -> + * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast() + * manipulating c->freelist without lock. + * + * This does not prevent kprobe in functions called from ___slab_alloc() such as + * local_lock_irqsave() itself, and that is fine, we only need to protect the + * c->freelist manipulation in ___slab_alloc() itself. + */ +NOKPROBE_SYMBOL(___slab_alloc); /* * A wrapper for ___slab_alloc() for contexts where preemption is not yet @@ -4579,8 +4708,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, */ c = slub_get_cpu_ptr(s->cpu_slab); #endif - + if (unlikely(!gfpflags_allow_spinning(gfpflags))) { + if (local_lock_is_locked(&s->cpu_slab->lock)) { + /* + * EBUSY is an internal signal to kmalloc_nolock() to + * retry a different bucket. It's not propagated + * to the caller. + */ + p = ERR_PTR(-EBUSY); + goto out; + } + } p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); +out: #ifdef CONFIG_PREEMPT_COUNT slub_put_cpu_ptr(s->cpu_slab); #endif @@ -4704,7 +4844,7 @@ static void *__slab_alloc_node(struct kmem_cache *s, return NULL; } - object = alloc_single_from_new_slab(s, slab, orig_size); + object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); return object; } @@ -4783,8 +4923,9 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, if (p[i] && init && (!kasan_init || !kasan_has_integrated_init())) memset(p[i], 0, zero_size); - kmemleak_alloc_recursive(p[i], s->object_size, 1, - s->flags, init_flags); + if (gfpflags_allow_spinning(flags)) + kmemleak_alloc_recursive(p[i], s->object_size, 1, + s->flags, init_flags); kmsan_slab_alloc(s, p[i], init_flags); alloc_tagging_slab_alloc_hook(s, p[i], flags); } @@ -5451,6 +5592,96 @@ void *__kmalloc_noprof(size_t size, gfp_t flags) } EXPORT_SYMBOL(__kmalloc_noprof); +/** + * kmalloc_nolock - Allocate an object of given size from any context. + * @size: size to allocate + * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT + * allowed. + * @node: node number of the target node. + * + * Return: pointer to the new object or NULL in case of error. + * NULL does not mean EBUSY or EAGAIN. It means ENOMEM. + * There is no reason to call it again and expect !NULL. 
+ */ +void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) +{ + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; + struct kmem_cache *s; + bool can_retry = true; + void *ret = ERR_PTR(-EBUSY); + + VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO | + __GFP_NO_OBJ_EXT)); + + if (unlikely(!size)) + return ZERO_SIZE_PTR; + + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) + /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */ + return NULL; +retry: + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return NULL; + s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_); + + if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) + /* + * kmalloc_nolock() is not supported on architectures that + * don't implement cmpxchg16b, but debug caches don't use + * per-cpu slab and per-cpu partial slabs. They rely on + * kmem_cache_node->list_lock, so kmalloc_nolock() can + * attempt to allocate from debug caches by + * spin_trylock_irqsave(&n->list_lock, ...) + */ + return NULL; + + /* + * Do not call slab_alloc_node(), since trylock mode isn't + * compatible with slab_pre_alloc_hook/should_failslab and + * kfence_alloc. Hence call __slab_alloc_node() (at most twice) + * and slab_post_alloc_hook() directly. + * + * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair + * in irq saved region. It assumes that the same cpu will not + * __update_cpu_freelist_fast() into the same (freelist,tid) pair. + * Therefore use in_nmi() to check whether particular bucket is in + * irq protected section. + * + * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that + * this cpu was interrupted somewhere inside ___slab_alloc() after + * it did local_lock_irqsave(&s->cpu_slab->lock, flags). + * In this case fast path with __update_cpu_freelist_fast() is not safe. + */ +#ifndef CONFIG_SLUB_TINY + if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) +#endif + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); + + if (PTR_ERR(ret) == -EBUSY) { + if (can_retry) { + /* pick the next kmalloc bucket */ + size = s->object_size + 1; + /* + * Another alternative is to + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; + * to retry from bucket of the same size. + */ + can_retry = false; + goto retry; + } + ret = NULL; + } + + maybe_wipe_obj_freeptr(s, ret); + slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret, + slab_want_init_on_alloc(alloc_gfp, s), size); + + ret = kasan_kmalloc(s, ret, size, alloc_gfp); + return ret; +} +EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof); + void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node, unsigned long caller) { @@ -6108,6 +6339,93 @@ flush_remote: } } +struct defer_free { + struct llist_head objects; + struct llist_head slabs; + struct irq_work work; +}; + +static void free_deferred_objects(struct irq_work *work); + +static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { + .objects = LLIST_HEAD_INIT(objects), + .slabs = LLIST_HEAD_INIT(slabs), + .work = IRQ_WORK_INIT(free_deferred_objects), +}; + +/* + * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe + * to take sleeping spin_locks from __slab_free() and deactivate_slab(). + * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). 
+ */ +static void free_deferred_objects(struct irq_work *work) +{ + struct defer_free *df = container_of(work, struct defer_free, work); + struct llist_head *objs = &df->objects; + struct llist_head *slabs = &df->slabs; + struct llist_node *llnode, *pos, *t; + + if (llist_empty(objs) && llist_empty(slabs)) + return; + + llnode = llist_del_all(objs); + llist_for_each_safe(pos, t, llnode) { + struct kmem_cache *s; + struct slab *slab; + void *x = pos; + + slab = virt_to_slab(x); + s = slab->slab_cache; + + /* + * We used freepointer in 'x' to link 'x' into df->objects. + * Clear it to NULL to avoid false positive detection + * of "Freepointer corruption". + */ + *(void **)x = NULL; + + /* Point 'x' back to the beginning of allocated object */ + x -= s->offset; + __slab_free(s, slab, x, x, 1, _THIS_IP_); + } + + llnode = llist_del_all(slabs); + llist_for_each_safe(pos, t, llnode) { + struct slab *slab = container_of(pos, struct slab, llnode); + +#ifdef CONFIG_SLUB_TINY + discard_slab(slab->slab_cache, slab); +#else + deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); +#endif + } +} + +static void defer_free(struct kmem_cache *s, void *head) +{ + struct defer_free *df = this_cpu_ptr(&defer_free_objects); + + if (llist_add(head + s->offset, &df->objects)) + irq_work_queue(&df->work); +} + +static void defer_deactivate_slab(struct slab *slab, void *flush_freelist) +{ + struct defer_free *df = this_cpu_ptr(&defer_free_objects); + + slab->flush_freelist = flush_freelist; + if (llist_add(&slab->llnode, &df->slabs)) + irq_work_queue(&df->work); +} + +void defer_free_barrier(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); +} + #ifndef CONFIG_SLUB_TINY /* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that @@ -6128,6 +6446,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, int cnt, unsigned long addr) { + /* cnt == 0 signals that it's called from kfree_nolock() */ + bool allow_spin = cnt; struct kmem_cache_cpu *c; unsigned long tid; void **freelist; @@ -6146,10 +6466,29 @@ redo: barrier(); if (unlikely(slab != c->slab)) { - __slab_free(s, slab, head, tail, cnt, addr); + if (unlikely(!allow_spin)) { + /* + * __slab_free() can locklessly cmpxchg16 into a slab, + * but then it might need to take spin_lock or local_lock + * in put_cpu_partial() for further processing. + * Avoid the complexity and simply add to a deferred list. + */ + defer_free(s, head); + } else { + __slab_free(s, slab, head, tail, cnt, addr); + } return; } + if (unlikely(!allow_spin)) { + if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) && + local_lock_is_locked(&s->cpu_slab->lock)) { + defer_free(s, head); + return; + } + cnt = 1; /* restore cnt. 
kfree_nolock() frees one object at a time */ + } + if (USE_LOCKLESS_FAST_PATH()) { freelist = READ_ONCE(c->freelist); @@ -6160,11 +6499,13 @@ redo: goto redo; } } else { + __maybe_unused unsigned long flags = 0; + /* Update the free list under the local lock */ - local_lock(&s->cpu_slab->lock); + local_lock_cpu_slab(s, flags); c = this_cpu_ptr(s->cpu_slab); if (unlikely(slab != c->slab)) { - local_unlock(&s->cpu_slab->lock); + local_unlock_cpu_slab(s, flags); goto redo; } tid = c->tid; @@ -6174,7 +6515,7 @@ redo: c->freelist = head; c->tid = next_tid(tid); - local_unlock(&s->cpu_slab->lock); + local_unlock_cpu_slab(s, flags); } stat_add(s, FREE_FASTPATH, cnt); } @@ -6405,6 +6746,71 @@ void kfree(const void *object) } EXPORT_SYMBOL(kfree); +/* + * Can be called while holding raw_spinlock_t or from IRQ and NMI, + * but ONLY for objects allocated by kmalloc_nolock(). + * Debug checks (like kmemleak and kfence) were skipped on allocation, + * hence + * obj = kmalloc(); kfree_nolock(obj); + * will miss kmemleak/kfence book keeping and will cause false positives. + * large_kmalloc is not supported either. + */ +void kfree_nolock(const void *object) +{ + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + void *x = (void *)object; + + if (unlikely(ZERO_OR_NULL_PTR(object))) + return; + + folio = virt_to_folio(object); + if (unlikely(!folio_test_slab(folio))) { + WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()"); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + + memcg_slab_free_hook(s, slab, &x, 1); + alloc_tagging_slab_free_hook(s, slab, &x, 1); + /* + * Unlike slab_free() do NOT call the following: + * kmemleak_free_recursive(x, s->flags); + * debug_check_no_locks_freed(x, s->object_size); + * debug_check_no_obj_freed(x, s->object_size); + * __kcsan_check_access(x, s->object_size, ..); + * kfence_free(x); + * since they take spinlocks or not safe from any context. + */ + kmsan_slab_free(s, x); + /* + * If KASAN finds a kernel bug it will do kasan_report_invalid_free() + * which will call raw_spin_lock_irqsave() which is technically + * unsafe from NMI, but take chance and report kernel bug. + * The sequence of + * kasan_report_invalid_free() -> raw_spin_lock_irqsave() -> NMI + * -> kfree_nolock() -> kasan_report_invalid_free() on the same CPU + * is double buggy and deserves to deadlock. + */ + if (kasan_slab_pre_free(s, x)) + return; + /* + * memcg, kasan_slab_pre_free are done for 'x'. + * The only thing left is kasan_poison without quarantine, + * since kasan quarantine takes locks and not supported from NMI. + */ + kasan_slab_free(s, x, false, false, /* skip quarantine */true); +#ifndef CONFIG_SLUB_TINY + do_slab_free(s, slab, x, x, 0, _RET_IP_); +#else + defer_free(s, x); +#endif +} +EXPORT_SYMBOL_GPL(kfree_nolock); + static __always_inline __realloc_size(2) void * __do_krealloc(const void *p, size_t new_size, gfp_t flags) { -- cgit v1.2.3 From 9a0abc39450a3123fd52533a662fbd37e0d1508c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 17:47:14 +0200 Subject: PM: runtime: Add auto-cleanup macros for "resume and get" operations It is generally useful to be able to automatically drop a device's runtime PM usage counter incremented by runtime PM operations that resume a device and bump up its usage counter [1]. To that end, add guard definition macros allowing pm_runtime_put() and pm_runtime_put_autosuspend() to be used for the auto-cleanup in those cases. 
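For orientation, here is a self-contained sketch of the basic guard in a driver context (the foo_* names are hypothetical and only illustrate the pattern; they are not part of this patch):

#include <linux/pm_runtime.h>

/* Resume the device for the duration of the access; the guard calls
 * pm_runtime_get_sync() on entry and pm_runtime_put() automatically on
 * every return path.
 */
static int foo_read_reg(struct device *dev, u32 *val)
{
	guard(pm_runtime_active)(dev);

	*val = foo_hw_read(dev);	/* hypothetical register access */
	return 0;
}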
Simply put, a piece of code like below: pm_runtime_get_sync(dev); ..... pm_runtime_put(dev); return 0; can be transformed with guard() like: guard(pm_runtime_active)(dev); ..... return 0; (see the pm_runtime_put() call is gone). However, it is better to do proper error handling in the majority of cases, so doing something like this instead of the above is recommended: ACQUIRE(pm_runtime_active_try, pm)(dev); if (ACQUIRE_ERR(pm_runtime_active_try, &pm)) return -ENXIO; ..... return 0; In all of the cases in which runtime PM is known to be enabled for the given device or the device can be regarded as operational (and so it can be accessed) with runtime PM disabled, a piece of code like: ret = pm_runtime_resume_and_get(dev); if (ret < 0) return ret; ..... pm_runtime_put(dev); return 0; can be changed as follows: ACQUIRE(pm_runtime_active_try, pm)(dev); ret = ACQUIRE_ERR(pm_runtime_active_try, &pm); if (ret < 0) return ret; ..... return 0; (again, see the pm_runtime_put() call is gone). Still, if the device cannot be accessed unless runtime PM has been enabled for it, the pm_runtime_active_try_enabled guard variant needs to be used, that is (in the context of the example above): ACQUIRE(pm_runtime_active_try_enabled, pm)(dev); ret = ACQUIRE_ERR(pm_runtime_active_try_enabled, &pm); if (ret < 0) return ret; ..... return 0; When the original code calls pm_runtime_put_autosuspend(), use one of the "auto" guard variants, pm_runtime_active_auto/_try/_enabled, so for example, a piece of code like: ret = pm_runtime_resume_and_get(dev); if (ret < 0) return ret; ..... pm_runtime_put_autosuspend(dev); return 0; will become: ACQUIRE(pm_runtime_active_auto_try_enabled, pm)(dev); ret = ACQUIRE_ERR(pm_runtime_active_auto_try_enabled, &pm); if (ret < 0) return ret; ..... return 0; Note that the cases in which the return value of pm_runtime_get_sync() is checked can also be handled with the help of the new guard macros. For example, a piece of code like: ret = pm_runtime_get_sync(dev); if (ret < 0) { pm_runtime_put(dev); return ret; } ..... pm_runtime_put(dev); return 0; can be rewritten as: ACQUIRE(pm_runtime_active_auto_try_enabled, pm)(dev); ret = ACQUIRE_ERR(pm_runtime_active_auto_try_enabled, &pm); if (ret < 0) return ret; ..... return 0; or pm_runtime_get_active_try can be used if transparent handling of disabled runtime PM is desirable. Link: https://lore.kernel.org/linux-pm/878qimv24u.wl-tiwai@suse.de/ [1] Link: https://lore.kernel.org/linux-pm/20250926150613.000073a4@huawei.com/ Signed-off-by: Rafael J. Wysocki Acked-by: Dan Williams Reviewed-by: Takashi Iwai Link: https://patch.msgid.link/2238241.irdbgypaU6@rafael.j.wysocki [ rjw: Fixed leftovers from the previous version in the changelog ] Reviewed-by: Jonathan Cameron Reviewed-by: Dhruva Gole Signed-off-by: Rafael J. 
Wysocki --- drivers/base/power/runtime.c | 2 ++ include/linux/pm_runtime.h | 44 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 37 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index faa68bf9ef3d..0fc825066ede 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -799,6 +799,8 @@ static int rpm_resume(struct device *dev, int rpmflags) if (dev->power.runtime_status == RPM_ACTIVE && dev->power.last_status == RPM_ACTIVE) retval = 1; + else if (rpmflags & RPM_TRANSPARENT) + goto out; else retval = -EACCES; } diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index d1ff76e0e2d0..e5426bdd0c9f 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -21,6 +21,7 @@ #define RPM_GET_PUT 0x04 /* Increment/decrement the usage_count */ #define RPM_AUTO 0x08 /* Use autosuspend_delay */ +#define RPM_TRANSPARENT 0x10 /* Succeed if runtime PM is disabled */ /* * Use this for defining a set of PM operations to be used in all situations @@ -512,6 +513,19 @@ static inline int pm_runtime_get_sync(struct device *dev) return __pm_runtime_resume(dev, RPM_GET_PUT); } +static inline int pm_runtime_get_active(struct device *dev, int rpmflags) +{ + int ret; + + ret = __pm_runtime_resume(dev, RPM_GET_PUT | rpmflags); + if (ret < 0) { + pm_runtime_put_noidle(dev); + return ret; + } + + return 0; +} + /** * pm_runtime_resume_and_get - Bump up usage counter of a device and resume it. * @dev: Target device. @@ -522,15 +536,7 @@ static inline int pm_runtime_get_sync(struct device *dev) */ static inline int pm_runtime_resume_and_get(struct device *dev) { - int ret; - - ret = __pm_runtime_resume(dev, RPM_GET_PUT); - if (ret < 0) { - pm_runtime_put_noidle(dev); - return ret; - } - - return 0; + return pm_runtime_get_active(dev, 0); } /** @@ -610,6 +616,26 @@ static inline int pm_runtime_put_autosuspend(struct device *dev) return __pm_runtime_put_autosuspend(dev); } +DEFINE_GUARD(pm_runtime_active, struct device *, + pm_runtime_get_sync(_T), pm_runtime_put(_T)); +DEFINE_GUARD(pm_runtime_active_auto, struct device *, + pm_runtime_get_sync(_T), pm_runtime_put_autosuspend(_T)); +/* + * Use the following guards with ACQUIRE()/ACQUIRE_ERR(). + * + * The difference between the "_try" and "_try_enabled" variants is that the + * former do not produce an error when runtime PM is disabled for the given + * device. + */ +DEFINE_GUARD_COND(pm_runtime_active, _try, + pm_runtime_get_active(_T, RPM_TRANSPARENT)) +DEFINE_GUARD_COND(pm_runtime_active, _try_enabled, + pm_runtime_resume_and_get(_T)) +DEFINE_GUARD_COND(pm_runtime_active_auto, _try, + pm_runtime_get_active(_T, RPM_TRANSPARENT)) +DEFINE_GUARD_COND(pm_runtime_active_auto, _try_enabled, + pm_runtime_resume_and_get(_T)) + /** * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0. * @dev: Target device. -- cgit v1.2.3 From d5e58ce1fb0f13a9a0845851f267ede3551cd9fe Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 18:26:40 +0200 Subject: PM: runtime: Drop DEFINE_FREE() for pm_runtime_put() The DEFINE_FREE() for pm_runtime_put has been superseded by recently introduced runtime PM auto-cleanup macros and its only user has been converted to using one of the new macros, so drop it. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Dhruva Gole Reviewed-by: Takashi Iwai Reviewed-by: Jonathan Cameron --- include/linux/pm_runtime.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index e5426bdd0c9f..edb8aed5ef62 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -563,8 +563,6 @@ static inline int pm_runtime_put(struct device *dev) return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); } -DEFINE_FREE(pm_runtime_put, struct device *, if (_T) pm_runtime_put(_T)) - /** * __pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. * @dev: Target device. -- cgit v1.2.3 From c39d6d4d933381714b6e5d735545256558ec6c05 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sat, 27 Sep 2025 08:29:35 -0400 Subject: ptr_ring: __ptr_ring_zero_tail micro optimization __ptr_ring_zero_tail currently does the - 1 operation twice: - during initialization of head - at each loop iteration Let's just do it in one place; all we need to do is adjust the loop condition. This is better: - slightly clearer logic with less duplication - uses prefix -- we don't need to save the old value - one less - 1 operation - for example, when ring is empty we now don't do - 1 at all, existing code does it once Text size shrinks from 15081 to 15050 bytes. Signed-off-by: Michael S. Tsirkin Reviewed-by: Simon Horman Link: https://patch.msgid.link/bcd630c7edc628e20d4f8e037341f26c90ab4365.1758976026.git.mst@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/ptr_ring.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index a736b16859a6..534531807d95 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -248,15 +248,15 @@ static inline bool ptr_ring_empty_bh(struct ptr_ring *r) */ static inline void __ptr_ring_zero_tail(struct ptr_ring *r, int consumer_head) { - int head = consumer_head - 1; + int head = consumer_head; /* Zero out entries in the reverse order: this way we touch the * cache line that producer might currently be reading the last; * producer won't make progress and touch other cache lines * besides the first one until we write out all entries. */ - while (likely(head >= r->consumer_tail)) - r->queue[head--] = NULL; + while (likely(head > r->consumer_tail)) + r->queue[--head] = NULL; r->consumer_tail = consumer_head; } -- cgit v1.2.3 From e28d5a68b6519ec6b2118a3f604295b5534eeb51 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Sat, 27 Sep 2025 10:49:11 +0200 Subject: dpll: add phase_offset_avg_factor_get/set callback ops Add new callback operations for a dpll device: - phase_offset_avg_factor_get(...) - to obtain the current phase offset averaging factor from the dpll device, - phase_offset_avg_factor_set(...) - to set the phase offset averaging factor Obtain the factor value using the get callback and provide it to the user if the device driver implements this callback. Execute the set callback upon user request, if the driver implements it.
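To illustrate, a minimal sketch of a driver wiring up the two callbacks (the foo_* names, the cached-value scheme, and FOO_MAX_AVG_FACTOR are hypothetical, not taken from this patch):

static int foo_phase_offset_avg_factor_get(const struct dpll_device *dpll,
					   void *dpll_priv, u32 *factor,
					   struct netlink_ext_ack *extack)
{
	struct foo_dpll *fd = dpll_priv;	/* hypothetical driver state */

	*factor = fd->avg_factor;		/* report last applied value */
	return 0;
}

static int foo_phase_offset_avg_factor_set(const struct dpll_device *dpll,
					   void *dpll_priv, u32 factor,
					   struct netlink_ext_ack *extack)
{
	struct foo_dpll *fd = dpll_priv;

	if (factor > FOO_MAX_AVG_FACTOR) {	/* hypothetical device limit */
		NL_SET_ERR_MSG(extack, "phase offset averaging factor too large");
		return -EINVAL;
	}

	fd->avg_factor = factor;
	return 0;
}

static const struct dpll_device_ops foo_dpll_ops = {
	/* other ops omitted for brevity */
	.phase_offset_avg_factor_get = foo_phase_offset_avg_factor_get,
	.phase_offset_avg_factor_set = foo_phase_offset_avg_factor_set,
};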
Signed-off-by: Ivan Vecera v2: * do not require 'set' callback to retrieve current value * always call 'set' callback regardless of current value Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250927084912.2343597-3-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- drivers/dpll/dpll_netlink.c | 66 ++++++++++++++++++++++++++++++++++++++++----- include/linux/dpll.h | 6 +++++ 2 files changed, 65 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index 0a852011653c..74c1f0ca95f2 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -164,6 +164,27 @@ dpll_msg_add_phase_offset_monitor(struct sk_buff *msg, struct dpll_device *dpll, return 0; } +static int +dpll_msg_add_phase_offset_avg_factor(struct sk_buff *msg, + struct dpll_device *dpll, + struct netlink_ext_ack *extack) +{ + const struct dpll_device_ops *ops = dpll_device_ops(dpll); + u32 factor; + int ret; + + if (ops->phase_offset_avg_factor_get) { + ret = ops->phase_offset_avg_factor_get(dpll, dpll_priv(dpll), + &factor, extack); + if (ret) + return ret; + if (nla_put_u32(msg, DPLL_A_PHASE_OFFSET_AVG_FACTOR, factor)) + return -EMSGSIZE; + } + + return 0; +} + static int dpll_msg_add_lock_status(struct sk_buff *msg, struct dpll_device *dpll, struct netlink_ext_ack *extack) @@ -675,6 +696,9 @@ dpll_device_get_one(struct dpll_device *dpll, struct sk_buff *msg, if (nla_put_u32(msg, DPLL_A_TYPE, dpll->type)) return -EMSGSIZE; ret = dpll_msg_add_phase_offset_monitor(msg, dpll, extack); + if (ret) + return ret; + ret = dpll_msg_add_phase_offset_avg_factor(msg, dpll, extack); if (ret) return ret; @@ -839,6 +863,23 @@ dpll_phase_offset_monitor_set(struct dpll_device *dpll, struct nlattr *a, extack); } +static int +dpll_phase_offset_avg_factor_set(struct dpll_device *dpll, struct nlattr *a, + struct netlink_ext_ack *extack) +{ + const struct dpll_device_ops *ops = dpll_device_ops(dpll); + u32 factor = nla_get_u32(a); + + if (!ops->phase_offset_avg_factor_set) { + NL_SET_ERR_MSG_ATTR(extack, a, + "device not capable of changing phase offset average factor"); + return -EOPNOTSUPP; + } + + return ops->phase_offset_avg_factor_set(dpll, dpll_priv(dpll), factor, + extack); +} + static int dpll_pin_freq_set(struct dpll_pin *pin, struct nlattr *a, struct netlink_ext_ack *extack) @@ -1736,14 +1777,25 @@ int dpll_nl_device_get_doit(struct sk_buff *skb, struct genl_info *info) static int dpll_set_from_nlattr(struct dpll_device *dpll, struct genl_info *info) { - int ret; - - if (info->attrs[DPLL_A_PHASE_OFFSET_MONITOR]) { - struct nlattr *a = info->attrs[DPLL_A_PHASE_OFFSET_MONITOR]; + struct nlattr *a; + int rem, ret; - ret = dpll_phase_offset_monitor_set(dpll, a, info->extack); - if (ret) - return ret; + nla_for_each_attr(a, genlmsg_data(info->genlhdr), + genlmsg_len(info->genlhdr), rem) { + switch (nla_type(a)) { + case DPLL_A_PHASE_OFFSET_MONITOR: + ret = dpll_phase_offset_monitor_set(dpll, a, + info->extack); + if (ret) + return ret; + break; + case DPLL_A_PHASE_OFFSET_AVG_FACTOR: + ret = dpll_phase_offset_avg_factor_set(dpll, a, + info->extack); + if (ret) + return ret; + break; + } } return 0; diff --git a/include/linux/dpll.h b/include/linux/dpll.h index fa1e76920d0e..25be745bf41f 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -38,6 +38,12 @@ struct dpll_device_ops { void *dpll_priv, enum dpll_feature_state *state, struct netlink_ext_ack *extack); + int (*phase_offset_avg_factor_set)(const struct dpll_device *dpll, + 
void *dpll_priv, u32 factor, + struct netlink_ext_ack *extack); + int (*phase_offset_avg_factor_get)(const struct dpll_device *dpll, + void *dpll_priv, u32 *factor, + struct netlink_ext_ack *extack); }; struct dpll_pin_ops { -- cgit v1.2.3 From 7bd80ed89d72285515db673803b021469ba71ee8 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 24 Sep 2025 14:02:41 +0200 Subject: Documentation: net: add flow control guide and document ethtool API Introduce a new document, flow_control.rst, to provide a comprehensive guide on Ethernet Flow Control in Linux. The guide explains how flow control works, how autonegotiation resolves pause capabilities, and how to configure it using ethtool and Netlink. In parallel, document the pause and pause-stat attributes in the ethtool.yaml netlink spec. This enables the ynl tool to generate kernel-doc comments for the corresponding enums in the UAPI header, making the C interface self-documenting. Finally, replace the legacy flow control section in phy.rst with a reference to the new document and add pointers in the relevant C source files. Signed-off-by: Oleksij Rempel Link: https://patch.msgid.link/20250924120241.724850-1-o.rempel@pengutronix.de Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ethtool.yaml | 27 ++ Documentation/networking/flow_control.rst | 373 +++++++++++++++++++++++++ Documentation/networking/index.rst | 1 + Documentation/networking/phy.rst | 12 +- include/linux/ethtool.h | 45 ++- include/uapi/linux/ethtool_netlink_generated.h | 4 +- net/dcb/dcbnl.c | 2 + net/ethtool/pause.c | 4 + 8 files changed, 453 insertions(+), 15 deletions(-) create mode 100644 Documentation/networking/flow_control.rst (limited to 'include/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 6a0fb1974513..e4852505294f 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -864,7 +864,9 @@ attribute-sets: - name: pause-stat + doc: Statistics counters for link-wide PAUSE frames (IEEE 802.3 Annex 31B). attr-cnt-name: __ethtool-a-pause-stat-cnt + enum-name: ethtool-a-pause-stat attributes: - name: unspec @@ -875,13 +877,17 @@ attribute-sets: type: pad - name: tx-frames + doc: Number of PAUSE frames transmitted. type: u64 - name: rx-frames + doc: Number of PAUSE frames received. type: u64 - name: pause + doc: Parameters for link-wide PAUSE (IEEE 802.3 Annex 31B). attr-cnt-name: __ethtool-a-pause-cnt + enum-name: ethtool-a-pause attributes: - name: unspec @@ -893,19 +899,40 @@ attribute-sets: nested-attributes: header - name: autoneg + doc: | + Acts as a mode selector for the driver. + On GET: indicates the driver's behavior. If true, the driver will + respect the negotiated outcome; if false, the driver will use a + forced configuration. + On SET: if true, the driver configures the PHY's advertisement based + on the rx and tx attributes. If false, the driver forces the MAC + into the state defined by the rx and tx attributes. type: u8 - name: rx + doc: | + Enable receiving PAUSE frames (pausing local TX). + On GET: reflects the currently preferred configuration state. type: u8 - name: tx + doc: | + Enable transmitting PAUSE frames (pausing peer TX). + On GET: reflects the currently preferred configuration state. type: u8 - name: stats + doc: | + Contains the pause statistics counters. The source of these + statistics is determined by stats-src. 
type: nest nested-attributes: pause-stat - name: stats-src + doc: | + Selects the source of the MAC statistics, values from + enum ethtool_mac_stats_src. This allows requesting statistics + from the individual components of the MAC Merge layer. type: u32 - name: eee diff --git a/Documentation/networking/flow_control.rst b/Documentation/networking/flow_control.rst new file mode 100644 index 000000000000..48646d54513f --- /dev/null +++ b/Documentation/networking/flow_control.rst @@ -0,0 +1,373 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. _ethernet-flow-control: + +===================== +Ethernet Flow Control +===================== + +This document is a practical guide to Ethernet Flow Control in Linux, covering +what it is, how it works, and how to configure it. + +What is Flow Control? +===================== + +Flow control is a mechanism to prevent a fast sender from overwhelming a +slow receiver with data, which would cause buffer overruns and dropped packets. +The receiver can signal the sender to temporarily stop transmitting, giving it +time to process its backlog. + +Standards references +==================== + +Ethernet flow control mechanisms are specified across consolidated IEEE base +standards; some originated as amendments: + +- Collision-based flow control is part of CSMA/CD in **IEEE 802.3** + (half-duplex). +- Link-wide PAUSE is defined in **IEEE 802.3 Annex 31B** + (originally **802.3x**). +- Priority-based Flow Control (PFC) is defined in **IEEE 802.1Q Clause 36** + (originally **802.1Qbb**). + +In the remainder of this document, the consolidated clause numbers are used. + +How It Works: The Mechanisms +============================ + +The method used for flow control depends on the link's duplex mode. + +.. note:: + The user-visible ``ethtool`` pause API described in this document controls + **link-wide PAUSE** (IEEE 802.3 Annex 31B) only. It does not control the + collision-based behavior that exists on half-duplex links. + +1. Half-Duplex: Collision-Based Flow Control +-------------------------------------------- +On half-duplex links, a device cannot send and receive simultaneously, so PAUSE +frames are not used. Flow control is achieved by leveraging the CSMA/CD +(Carrier Sense Multiple Access with Collision Detection) protocol itself. + +* **How it works**: To inhibit incoming data, a receiving device can force a + collision on the line. When the sending station detects this collision, it + terminates its transmission, sends a "jam" signal, and then executes the + "Collision backoff and retransmission" procedure as defined in IEEE 802.3, + Section 4.2.3.2.5. This algorithm makes the sender wait for a random + period before attempting to retransmit. By repeatedly forcing collisions, + the receiver can effectively throttle the sender's transmission rate. + +.. note:: + While this mechanism is part of the IEEE standard, there is currently no + generic kernel API to configure or control it. Drivers should not enable + this feature until a standardized interface is available. + +.. warning:: + On shared-medium networks (e.g. 10BASE2, or twisted-pair networks using a + hub rather than a switch) forcing collisions inhibits traffic **across the + entire shared segment**, not just a single point-to-point link. Enabling + such behavior is generally undesirable. + +2. Full-Duplex: Link-wide PAUSE (IEEE 802.3 Annex 31B) +------------------------------------------------------ +On full-duplex links, devices can send and receive at the same time. 
Flow +control is achieved by sending a special **PAUSE frame**, defined by IEEE +802.3 Annex 31B. This mechanism pauses all traffic on the link and is therefore +called *link-wide PAUSE*. + +* **What it is**: A standard Ethernet frame with a globally reserved + destination MAC address (``01-80-C2-00-00-01``). This address is in a range + that standard IEEE 802.1D-compliant bridges do not forward. However, some + unmanaged or misconfigured bridges have been reported to forward these + frames, which can disrupt flow control across a network. + +* **How it works**: The frame contains a MAC Control opcode for PAUSE + (``0x0001``) and a ``pause_time`` value, telling the sender how long to + wait before sending more data frames. This time is specified in units of + "pause quantum", where one quantum is the time it takes to transmit 512 bits. + For example, one pause quantum is 51.2 microseconds on a 10 Mbit/s link, + and 512 nanoseconds on a 1 Gbit/s link. A ``pause_time`` of zero indicates + that the transmitter can resume transmission, even if a previous non-zero + pause time has not yet elapsed. + +* **Who uses it**: Any full-duplex link, from 10 Mbit/s to multi-gigabit speeds. + +3. Full-Duplex: Priority-based Flow Control (PFC) (IEEE 802.1Q Clause 36) +------------------------------------------------------------------------- +Priority-based Flow Control is an enhancement to the standard PAUSE mechanism +that allows flow control to be applied independently to different classes of +traffic, identified by their priority level. + +* **What it is**: PFC allows a receiver to pause traffic for one or more of the + 8 standard priority levels without stopping traffic for other priorities. + This is critical in data center environments for protocols that cannot + tolerate packet loss due to congestion (e.g., Fibre Channel over Ethernet + or RoCE). + +* **How it works**: PFC uses a specific PAUSE frame format. It shares the same + globally reserved destination MAC address (``01-80-C2-00-00-01``) as legacy + PAUSE frames but uses a unique opcode (``0x0101``). The frame payload + contains two key fields: + + - **``priority_enable_vector``**: An 8-bit mask where each bit corresponds to + one of the 8 priorities. If a bit is set to 1, it means the pause time + for that priority is active. + - **``time_vector``**: A list of eight 2-octet fields, one for each priority. + Each field specifies the ``pause_time`` for its corresponding priority, + measured in units of ``pause_quanta`` (the time to transmit 512 bits). + +.. note:: + When PFC is enabled for at least one priority on a port, the standard + **link-wide PAUSE** (IEEE 802.3 Annex 31B) must be disabled for that port. + The two mechanisms are mutually exclusive (IEEE 802.1Q Clause 36). + +Configuring Flow Control +======================== + +Link-wide PAUSE and Priority-based Flow Control are configured with different +tools. + +Configuring Link-wide PAUSE with ``ethtool`` (IEEE 802.3 Annex 31B) +------------------------------------------------------------------- +Use ``ethtool -a `` to view and ``ethtool -A `` to change +the link-wide PAUSE settings. + +.. code-block:: bash + + # View current link-wide PAUSE settings + ethtool -a eth0 + + # Enable RX and TX pause, with autonegotiation + ethtool -A eth0 autoneg on rx on tx on + +**Key Configuration Concepts**: + +* **Pause Autoneg vs Generic Autoneg**: ``ethtool -A ... autoneg {on,off}`` + controls **Pause Autoneg** (Annex 31B) only. 
It is independent from the + **Generic link autonegotiation** configured with ``ethtool -s``. A device can + have Generic autoneg **on** while Pause Autoneg is **off**, and vice versa. + +* **If Pause Autoneg is off** (``-A ... autoneg off``): the device will **not** + advertise pause in the PHY. The MAC PAUSE state is **forced** according to + ``rx``/``tx`` and does not depend on partner capabilities or resolution. + Ensure the peer is configured complementarily for PAUSE to be effective. + +* **If generic autoneg is off** but **Pause Autoneg is on**, the pause policy + is **remembered** by the kernel and applied later when Generic autoneg is + enabled again. + +* **Autonegotiation Mode**: The PHY will *advertise* the ``rx`` and ``tx`` + capabilities. The final active state is determined by what both sides of the + link agree on. See the "PHY (Physical Layer Transceiver)" section below, + especially the *Resolution* subsection, for details of the negotiation rules. + +* **Forced Mode**: This mode is necessary when autonegotiation is not used or + not possible. This includes links where one or both partners have + autonegotiation disabled, or in setups without a PHY (e.g., direct + MAC-to-MAC connections). The driver bypasses PHY advertisement and + directly forces the MAC into the specified ``rx``/``tx`` state. The + configuration on both sides of the link must be complementary. For + example, if one side is set to ``tx on`` ``rx off``, the link partner must be + set to ``tx off`` ``rx on`` for flow control to function correctly. + +Configuring PFC with ``dcb`` (IEEE 802.1Q Clause 36) +---------------------------------------------------- +PFC is part of the Data Center Bridging (DCB) subsystem and is managed with the +``dcb`` tool (iproute2). Some deployments use ``dcbtool`` (lldpad) instead; this +document shows ``dcb(8)`` examples. + +**Viewing PFC Settings**: + +.. code-block:: text + + $ dcb pfc show dev eth0 + pfc-cap 8 macsec-bypass off delay 4096 + prio-pfc 0:off 1:off 2:off 3:off 4:off 5:off 6:on 7:on + +This shows the PFC state (on/off) for each priority (0-7). + +**Changing PFC Settings**: + +.. code-block:: bash + + # Enable PFC on priorities 6 and 7, leaving others as they are + $ dcb pfc set dev eth0 prio-pfc 6:on 7:on + + # Disable PFC for all priorities except 6 and 7 + $ dcb pfc set dev eth0 prio-pfc all:off 6:on 7:on + +Monitoring Flow Control +======================= + +The standard way to check if flow control is actively being used is to view the +pause-related statistics. + +**Monitoring Link-wide PAUSE**: +Use ``ethtool --include-statistics -a ``. + +.. code-block:: text + + $ ethtool --include-statistics -a eth0 + Pause parameters for eth0: + ... + Statistics: + tx_pause_frames: 0 + rx_pause_frames: 0 + +**Monitoring PFC**: +PFC statistics (sent and received frames per priority) are available +through the ``dcb`` tool. + +.. code-block:: text + + $ dcb pfc show dev eth0 requests indications + requests 0:0 1:0 2:0 3:1024 4:2048 5:0 6:0 7:0 + indications 0:0 1:0 2:0 3:512 4:4096 5:0 6:0 7:0 + +The ``requests`` counters track transmitted PFC frames (TX), and the +``indications`` counters track received PFC frames (RX). + +Link-wide PAUSE Autonegotiation Details +======================================= + +The autonegotiation process for link-wide PAUSE is managed by the PHY and +involves advertising capabilities and resolving the outcome. + +* Terminology (link-wide PAUSE): + + - **Symmetric pause**: both directions are paused when requested (TX+RX + enabled). 
+ - **Asymmetric pause**: only one direction is paused (e.g., RX-only or + TX-only). + + In IEEE 802.3 advertisement/resolution, symmetric/asymmetric are encoded + using two bits (Pause/Asym) and resolved per the standard truth tables + below. + +* **Advertisement**: The PHY advertises the MAC's flow control capabilities. + This is done using two bits in the advertisement register: "Symmetric + Pause" (Pause) and "Asymmetric Pause" (Asym). These bits should be + interpreted as a combined value, not as independent flags. The kernel + converts the user's ``rx`` and ``tx`` settings into this two-bit value as + follows: + + .. code-block:: text + + tx rx | Pause Asym + -------+------------- + 0 0 | 0 0 + 0 1 | 1 1 + 1 0 | 0 1 + 1 1 | 1 0 + +* **Resolution**: After negotiation, the PHY reports the link partner's + advertised Pause and Asym bits. The final flow control mode is determined + by the combination of the local and partner advertisements, according to + the IEEE 802.3 standard: + + .. code-block:: text + + Local Device | Link Partner | Result + Pause Asym | Pause Asym | + -------------------+--------------------+--------- + 0 X | 0 X | Disabled + 0 1 | 1 0 | Disabled + 0 1 | 1 1 | TX only + 1 0 | 0 X | Disabled + 1 X | 1 X | TX + RX + 1 1 | 0 1 | RX only + + It is important to note that the advertised bits reflect the *current + configuration* of the MAC, which may not represent its full hardware + capabilities. + +Kernel Policy: "Set and Trust" +============================== + +The ethtool pause API is defined as a **wish policy** for +IEEE 802.3 link-wide PAUSE only. A user request is always accepted +as the preferred configuration, but it may not be possible to apply +it in all link states. + +Key constraints: + +- Link-wide PAUSE is not valid on half-duplex links. +- Link-wide PAUSE cannot be used together with Priority-based Flow Control + (PFC, IEEE 802.1Q Clause 36). +- If autonegotiation is active and the link is currently down, the future + mode is not yet known. + +Because of these constraints, the kernel stores the requested setting +and applies it only when the link is in a compatible state. + +Implications for userspace: + +1. Set once (the "wish"): the requested Rx/Tx PAUSE policy is + remembered even if it cannot be applied immediately. +2. Applied conditionally: when the link comes up, the kernel enables + PAUSE only if the active mode allows it. + +Component Roles in Flow Control +=============================== + +The configuration of flow control involves several components, each with a +distinct role. + +The MAC (Media Access Controller) +--------------------------------- +The MAC is the hardware component that actually sends and receives PAUSE +frames. Its capabilities define the upper limit of what the driver can support. +For link-wide PAUSE, MACs can vary in their support for symmetric (both +directions) or asymmetric (independent TX/RX) flow control. + +For PFC, the MAC must be capable of generating and interpreting the +priority-based PAUSE frames and managing separate pause states for each +traffic class. + +Many MACs also implement automatic PAUSE frame transmission based on the fill +level of their internal RX FIFO. This is typically configured with two +thresholds: + +* **FLOW_ON (High Water Mark)**: When the RX FIFO usage reaches this + threshold, the MAC automatically transmits a PAUSE frame to stop the sender. 
+ +* **FLOW_OFF (Low Water Mark)**: When the RX FIFO usage drops below this + threshold, the MAC transmits a PAUSE frame with a quantum of zero to tell + the sender it can resume transmission. + +The PHY (Physical Layer Transceiver) +------------------------------------ +The PHY's role is distinct for each flow control mechanism: + +* **Link-wide PAUSE**: During the autonegotiation process, the PHY is + responsible for advertising the device's flow control capabilities. See the + "Link-wide PAUSE Autonegotiation Details" section for more information. + +* **Half-Duplex Collision-Based Flow Control**: The PHY is fundamental to the + CSMA/CD process. It performs carrier sensing (checking if the line is idle) + and collision detection, which is the mechanism leveraged to throttle the + sender. + +* **Priority-based Flow Control (PFC)**: The PHY is not directly involved in + negotiating PFC capabilities. Its role is to establish the physical link. + PFC negotiation happens at a higher layer via the Data Center Bridging + Capability Exchange Protocol (DCBX). + +User Space Interface +==================== +The primary user space tools are ``ethtool`` for link-wide PAUSE and ``dcb`` for +PFC. They communicate with the kernel to configure the network device driver +and underlying hardware. + +**Link-wide PAUSE Netlink Interface (``ethtool``)** + +See the ethtool Netlink spec (``Documentation/netlink/specs/ethtool.yaml``) +for the authoritative definition of the Pause control and Pause statistics +attributes. The generated UAPI is in +``include/uapi/linux/ethtool_netlink_generated.h``. + +**PFC Netlink Interface (``dcb``)** + +The authoritative definitions for DCB/PFC netlink attributes and commands are in +``include/uapi/linux/dcbnl.h``. See also the ``dcb(8)`` manual page and the DCB +subsystem documentation for userspace configuration details. + diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index c775cababc8c..52aafdc85f6a 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -55,6 +55,7 @@ Contents: eql fib_trie filter + flow_control generic-hdlc generic_netlink ../netlink/specs/index diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index b0f2ef83735d..40cc0a988d60 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -343,16 +343,8 @@ Some of the interface modes are described below: Pause frames / flow control =========================== -The PHY does not participate directly in flow control/pause frames except by -making sure that the SUPPORTED_Pause and SUPPORTED_AsymPause bits are set in -MII_ADVERTISE to indicate towards the link partner that the Ethernet MAC -controller supports such a thing. Since flow control/pause frames generation -involves the Ethernet MAC driver, it is recommended that this driver takes care -of properly indicating advertisement and support for such features by setting -the SUPPORTED_Pause and SUPPORTED_AsymPause bits accordingly. This can be done -either before or after phy_connect() and/or as a result of implementing the -ethtool::set_pauseparam feature. - +For detailed link-wide PAUSE and PFC behavior and configuration, see +flow_control.rst. 
Keeping Close Tabs on the PAL ============================= diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c2d8b4ec62eb..eeed1ea50369 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -953,9 +953,48 @@ struct kernel_ethtool_ts_info { * @get_pause_stats: Report pause frame statistics. Drivers must not zero * statistics which they don't report. The stats structure is initialized * to ETHTOOL_STAT_NOT_SET indicating driver does not report statistics. - * @get_pauseparam: Report pause parameters - * @set_pauseparam: Set pause parameters. Returns a negative error code - * or zero. + * + * @get_pauseparam: Report the configured policy for link-wide PAUSE + * (IEEE 802.3 Annex 31B). Drivers must fill struct ethtool_pauseparam + * such that: + * @autoneg: + * This refers to **Pause Autoneg** (IEEE 802.3 Annex 31B) only + * and is independent of generic link autonegotiation configured + * via ethtool -s. + * true -> the device follows the negotiated result of pause + * autonegotiation (Pause/Asym); + * false -> the device uses a forced MAC state independent of + * negotiation. + * @rx_pause/@tx_pause: + * represent the desired policy (preferred configuration). + * In autoneg mode they describe what is to be advertised; + * in forced mode they describe the MAC state to apply. + * + * Drivers (and/or frameworks) should persist this policy across link + * changes and reapply appropriate MAC programming when link parameters + * change. + * + * @set_pauseparam: Apply a policy for link-wide PAUSE (IEEE 802.3 Annex 31B). + * If @autoneg is true: + * Arrange for pause advertisement (Pause/Asym) based on + * @rx_pause/@tx_pause and program the MAC to follow the + * negotiated result (which may be symmetric, asymmetric, or off + * depending on the link partner). + * If @autoneg is false: + * Do not rely on autonegotiation; force the MAC RX/TX pause + * state directly per @rx_pause/@tx_pause. + * + * Implementations that integrate with PHYLIB/PHYLINK should cooperate + * with those frameworks for advertisement and resolution; MAC drivers are + * still responsible for applying the required MAC state. + * + * Return: 0 on success or a negative errno. Return -EOPNOTSUPP if + * link-wide PAUSE is unsupported. If only symmetric pause is supported, + * reject unsupported asymmetric requests with -EINVAL (or document any + * coercion policy). + * + * See also: Documentation/networking/flow_control.rst + * * @self_test: Run specified self-tests * @get_strings: Return a set of strings that describe the requested objects * @set_phys_id: Identify the physical devices, e.g. 
by flashing an LED diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 0e8ac0d974e2..3dd9d7cde86e 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -375,7 +375,7 @@ enum { ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) }; -enum { +enum ethtool_a_pause_stat { ETHTOOL_A_PAUSE_STAT_UNSPEC, ETHTOOL_A_PAUSE_STAT_PAD, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, @@ -385,7 +385,7 @@ enum { ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) }; -enum { +enum ethtool_a_pause { ETHTOOL_A_PAUSE_UNSPEC, ETHTOOL_A_PAUSE_HEADER, ETHTOOL_A_PAUSE_AUTONEG, diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 03eb1d941fca..91ee22f53774 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -27,6 +27,8 @@ * * Priority-based Flow Control (PFC) - provides a flow control mechanism which * can work independently for each 802.1p priority. + * See Documentation/networking/flow_control.rst for a high level description + * of the user space interface for Priority-based Flow Control (PFC). * * Congestion Notification - provides a mechanism for end-to-end congestion * control for protocols which do not have built-in congestion management. diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 0f9af1e66548..eacf6a4859bf 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -1,5 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only +/* See Documentation/networking/flow_control.rst for a high level description of + * the userspace interface. + */ + #include "netlink.h" #include "common.h" -- cgit v1.2.3 From e211c463b748c4e2e8364c10bc216ca775fcc943 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 27 Sep 2025 21:52:30 +0200 Subject: net: phy: stop exporting phy_driver_unregister After 42e2a9e11a1d ("net: phy: dp83640: improve phydev and driver removal handling") we can stop exporting also phy_driver_unregister(). 
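For illustration, with dp83640 converted, remaining PHY drivers register
through the array-based API (typically via the module_phy_driver()
convenience macro), so the single-driver unregister never needs to be called
from outside phy_device.c. A minimal sketch with a made-up PHY ID:

	/* Hypothetical driver: phy_drivers_register()/_unregister()
	 * (wrapped by module_phy_driver()) handle both registration
	 * and removal of the whole array.
	 */
	static struct phy_driver example_phy_driver[] = {
		{
			PHY_ID_MATCH_EXACT(0x00112233),	/* made-up ID */
			.name = "Example PHY",
			/* .features, callbacks, etc. */
		},
	};
	module_phy_driver(example_phy_driver);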
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/2bab950e-4b70-4030-b997-03f48379586f@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/phy/phy_device.c | 11 +++++------ include/linux/phy.h | 1 - 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 01269b865f5e..1c02640412f7 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3589,6 +3589,11 @@ static int phy_driver_register(struct phy_driver *new_driver, return 0; } +static void phy_driver_unregister(struct phy_driver *drv) +{ + driver_unregister(&drv->mdiodrv.driver); +} + int phy_drivers_register(struct phy_driver *new_driver, int n, struct module *owner) { @@ -3606,12 +3611,6 @@ int phy_drivers_register(struct phy_driver *new_driver, int n, } EXPORT_SYMBOL(phy_drivers_register); -void phy_driver_unregister(struct phy_driver *drv) -{ - driver_unregister(&drv->mdiodrv.driver); -} -EXPORT_SYMBOL(phy_driver_unregister); - void phy_drivers_unregister(struct phy_driver *drv, int n) { int i; diff --git a/include/linux/phy.h b/include/linux/phy.h index b377dfaa6801..7a54a8b4d277 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2030,7 +2030,6 @@ static inline int phy_read_status(struct phy_device *phydev) return genphy_read_status(phydev); } -void phy_driver_unregister(struct phy_driver *drv); void phy_drivers_unregister(struct phy_driver *drv, int n); int phy_drivers_register(struct phy_driver *new_driver, int n, struct module *owner); -- cgit v1.2.3 From 9c94ae6bb0b2895024b6e29fcc1cbec968b4776a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Sep 2025 08:49:32 +0000 Subject: net: make softnet_data.defer_count an atomic This is preparation work to remove the softnet_data.defer_lock, as it is contended on hosts with large number of cores. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250928084934.3266948-2-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 +- net/core/dev.c | 2 +- net/core/skbuff.c | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1b85454116f6..27e3fa69253f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3538,7 +3538,7 @@ struct softnet_data { /* Another possibly contended cache line */ spinlock_t defer_lock ____cacheline_aligned_in_smp; - int defer_count; + atomic_t defer_count; int defer_ipi_scheduled; struct sk_buff *defer_list; call_single_data_t defer_csd; diff --git a/net/core/dev.c b/net/core/dev.c index 8b54fdf0289a..8566678d8344 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6726,7 +6726,7 @@ static void skb_defer_free_flush(struct softnet_data *sd) spin_lock(&sd->defer_lock); skb = sd->defer_list; sd->defer_list = NULL; - sd->defer_count = 0; + atomic_set(&sd->defer_count, 0); spin_unlock(&sd->defer_lock); while (skb != NULL) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 618afd59afff..16cd357d62a6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -7202,14 +7202,12 @@ nodefer: kfree_skb_napi_cache(skb); sd = &per_cpu(softnet_data, cpu); defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); - if (READ_ONCE(sd->defer_count) >= defer_max) + if (atomic_read(&sd->defer_count) >= defer_max) goto nodefer; spin_lock_bh(&sd->defer_lock); /* Send an IPI every time queue reaches half capacity. 
*/ - kick = sd->defer_count == (defer_max >> 1); - /* Paired with the READ_ONCE() few lines above */ - WRITE_ONCE(sd->defer_count, sd->defer_count + 1); + kick = (atomic_inc_return(&sd->defer_count) - 1) == (defer_max >> 1); skb->next = sd->defer_list; /* Paired with READ_ONCE() in skb_defer_free_flush() */ -- cgit v1.2.3 From 844c9db7f7f5fe1b0b53ed9f1c2bc7313b3021c8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Sep 2025 08:49:33 +0000 Subject: net: use llist for sd->defer_list Get rid of sd->defer_lock and adopt llist operations. We optimize skb_attempt_defer_free() for the common case, where the packet is queued. Otherwise sd->defer_count is increasing, until skb_defer_free_flush() clears it. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250928084934.3266948-3-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 8 ++++---- net/core/dev.c | 18 ++++++------------ net/core/skbuff.c | 15 +++++++-------- 3 files changed, 17 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 27e3fa69253f..5c9aa16933d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3537,10 +3537,10 @@ struct softnet_data { struct numa_drop_counters drop_counters; /* Another possibly contended cache line */ - spinlock_t defer_lock ____cacheline_aligned_in_smp; - atomic_t defer_count; - int defer_ipi_scheduled; - struct sk_buff *defer_list; + struct llist_head defer_list ____cacheline_aligned_in_smp; + atomic_long_t defer_count; + + int defer_ipi_scheduled ____cacheline_aligned_in_smp; call_single_data_t defer_csd; }; diff --git a/net/core/dev.c b/net/core/dev.c index 8566678d8344..fb67372774de 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6717,22 +6717,16 @@ EXPORT_SYMBOL(napi_complete_done); static void skb_defer_free_flush(struct softnet_data *sd) { + struct llist_node *free_list; struct sk_buff *skb, *next; - /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ - if (!READ_ONCE(sd->defer_list)) + if (llist_empty(&sd->defer_list)) return; + atomic_long_set(&sd->defer_count, 0); + free_list = llist_del_all(&sd->defer_list); - spin_lock(&sd->defer_lock); - skb = sd->defer_list; - sd->defer_list = NULL; - atomic_set(&sd->defer_count, 0); - spin_unlock(&sd->defer_lock); - - while (skb != NULL) { - next = skb->next; + llist_for_each_entry_safe(skb, next, free_list, ll_node) { napi_consume_skb(skb, 1); - skb = next; } } @@ -12995,7 +12989,7 @@ static int __init net_dev_init(void) sd->cpu = i; #endif INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); - spin_lock_init(&sd->defer_lock); + init_llist_head(&sd->defer_list); gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 16cd357d62a6..17455fc1e692 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -7185,6 +7185,7 @@ static void kfree_skb_napi_cache(struct sk_buff *skb) */ void skb_attempt_defer_free(struct sk_buff *skb) { + unsigned long defer_count; int cpu = skb->alloc_cpu; struct softnet_data *sd; unsigned int defer_max; @@ -7202,17 +7203,15 @@ nodefer: kfree_skb_napi_cache(skb); sd = &per_cpu(softnet_data, cpu); defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); - if (atomic_read(&sd->defer_count) >= defer_max) + defer_count = atomic_long_inc_return(&sd->defer_count); + + if (defer_count >= defer_max) goto nodefer; - spin_lock_bh(&sd->defer_lock); - /* Send an IPI every 
time queue reaches half capacity. */ - kick = (atomic_inc_return(&sd->defer_count) - 1) == (defer_max >> 1); + llist_add(&skb->ll_node, &sd->defer_list); - skb->next = sd->defer_list; - /* Paired with READ_ONCE() in skb_defer_free_flush() */ - WRITE_ONCE(sd->defer_list, skb); - spin_unlock_bh(&sd->defer_lock); + /* Send an IPI every time queue reaches half capacity. */ + kick = (defer_count - 1) == (defer_max >> 1); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). -- cgit v1.2.3 From 5628f3fe3b16114e8424bbfcf0594caef8958a06 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 28 Sep 2025 08:49:34 +0000 Subject: net: add NUMA awareness to skb_attempt_defer_free() Instead of sharing sd->defer_list & sd->defer_count with many cpus, add one pair for each NUMA node. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250928084934.3266948-4-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 4 ---- include/net/hotdata.h | 7 +++++++ net/core/dev.c | 35 +++++++++++++++++++++++------------ net/core/dev.h | 2 +- net/core/skbuff.c | 11 ++++++----- 5 files changed, 37 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5c9aa16933d1..d1a687444b27 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3536,10 +3536,6 @@ struct softnet_data { struct numa_drop_counters drop_counters; - /* Another possibly contended cache line */ - struct llist_head defer_list ____cacheline_aligned_in_smp; - atomic_long_t defer_count; - int defer_ipi_scheduled ____cacheline_aligned_in_smp; call_single_data_t defer_csd; }; diff --git a/include/net/hotdata.h b/include/net/hotdata.h index fda94b2647ff..4acec191c54a 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -2,10 +2,16 @@ #ifndef _NET_HOTDATA_H #define _NET_HOTDATA_H +#include #include #include #include +struct skb_defer_node { + struct llist_head defer_list; + atomic_long_t defer_count; +} ____cacheline_aligned_in_smp; + /* Read mostly data used in network fast paths. 
*/ struct net_hotdata { #if IS_ENABLED(CONFIG_INET) @@ -30,6 +36,7 @@ struct net_hotdata { struct rps_sock_flow_table __rcu *rps_sock_flow_table; u32 rps_cpu_mask; #endif + struct skb_defer_node __percpu *skb_defer_nodes; int gro_normal_batch; int netdev_budget; int netdev_budget_usecs; diff --git a/net/core/dev.c b/net/core/dev.c index fb67372774de..a64cef2c537e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5180,8 +5180,9 @@ static void napi_schedule_rps(struct softnet_data *sd) __napi_schedule_irqoff(&mysd->backlog); } -void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) +void kick_defer_list_purge(unsigned int cpu) { + struct softnet_data *sd = &per_cpu(softnet_data, cpu); unsigned long flags; if (use_backlog_threads()) { @@ -6715,18 +6716,24 @@ bool napi_complete_done(struct napi_struct *n, int work_done) } EXPORT_SYMBOL(napi_complete_done); -static void skb_defer_free_flush(struct softnet_data *sd) +static void skb_defer_free_flush(void) { struct llist_node *free_list; struct sk_buff *skb, *next; + struct skb_defer_node *sdn; + int node; - if (llist_empty(&sd->defer_list)) - return; - atomic_long_set(&sd->defer_count, 0); - free_list = llist_del_all(&sd->defer_list); + for_each_node(node) { + sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node; + + if (llist_empty(&sdn->defer_list)) + continue; + atomic_long_set(&sdn->defer_count, 0); + free_list = llist_del_all(&sdn->defer_list); - llist_for_each_entry_safe(skb, next, free_list, ll_node) { - napi_consume_skb(skb, 1); + llist_for_each_entry_safe(skb, next, free_list, ll_node) { + napi_consume_skb(skb, 1); + } } } @@ -6854,7 +6861,7 @@ count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); - skb_defer_free_flush(this_cpu_ptr(&softnet_data)); + skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); @@ -7713,7 +7720,7 @@ static void napi_threaded_poll_loop(struct napi_struct *napi) local_irq_disable(); net_rps_action_and_irq_enable(sd); } - skb_defer_free_flush(sd); + skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); @@ -7755,7 +7762,7 @@ start: for (;;) { struct napi_struct *n; - skb_defer_free_flush(sd); + skb_defer_free_flush(); if (list_empty(&list)) { if (list_empty(&repoll)) { @@ -12989,7 +12996,6 @@ static int __init net_dev_init(void) sd->cpu = i; #endif INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); - init_llist_head(&sd->defer_list); gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; @@ -12999,6 +13005,11 @@ static int __init net_dev_init(void) if (net_page_pool_create(i)) goto out; } + net_hotdata.skb_defer_nodes = + __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids, + __alignof__(struct skb_defer_node)); + if (!net_hotdata.skb_defer_nodes) + goto out; if (use_backlog_threads()) smpboot_register_percpu_thread(&backlog_threads); diff --git a/net/core/dev.h b/net/core/dev.h index d6b08d435479..900880e8b5b4 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -357,7 +357,7 @@ static inline void napi_assert_will_not_race(const struct napi_struct *napi) WARN_ON(READ_ONCE(napi->list_owner) != -1); } -void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); +void kick_defer_list_purge(unsigned int cpu); #define XMIT_RECURSION_LIMIT 8 diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 17455fc1e692..bc12790017b0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -7185,9 +7185,9 @@ static void kfree_skb_napi_cache(struct sk_buff *skb) */ void skb_attempt_defer_free(struct 
sk_buff *skb) { + struct skb_defer_node *sdn; unsigned long defer_count; int cpu = skb->alloc_cpu; - struct softnet_data *sd; unsigned int defer_max; bool kick; @@ -7201,14 +7201,15 @@ nodefer: kfree_skb_napi_cache(skb); DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); DEBUG_NET_WARN_ON_ONCE(skb->destructor); - sd = &per_cpu(softnet_data, cpu); + sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id(); + defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); - defer_count = atomic_long_inc_return(&sd->defer_count); + defer_count = atomic_long_inc_return(&sdn->defer_count); if (defer_count >= defer_max) goto nodefer; - llist_add(&skb->ll_node, &sd->defer_list); + llist_add(&skb->ll_node, &sdn->defer_list); /* Send an IPI every time queue reaches half capacity. */ kick = (defer_count - 1) == (defer_max >> 1); @@ -7217,7 +7218,7 @@ nodefer: kfree_skb_napi_cache(skb); * if we are unlucky enough (this seems very unlikely). */ if (unlikely(kick)) - kick_defer_list_purge(sd, cpu); + kick_defer_list_purge(cpu); } static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, -- cgit v1.2.3 From 20c48920583675e67b3824f147726e0fbda735ce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 18 Sep 2025 17:33:00 -0700 Subject: KVM: Export KVM-internal symbols for sub-modules only Rework the vast majority of KVM's exports to expose symbols only to KVM submodules, i.e. to x86's kvm-{amd,intel}.ko and PPC's kvm-{pr,hv}.ko. With few exceptions, KVM's exported APIs are intended (and safe) for KVM- internal usage only. Keep kvm_get_kvm(), kvm_get_kvm_safe(), and kvm_put_kvm() as normal exports, as they are needed by VFIO, and are generally safe for external usage (though ideally even the get/put APIs would be KVM-internal, and VFIO would pin a VM by grabbing a reference to its associated file). Implement a framework in kvm_types.h in anticipation of providing a macro to restrict KVM-specific kernel exports, i.e. to provide symbol exports for KVM if and only if KVM is built as one or more modules. 
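To make the new export scheme concrete, here is a sketch of how a restricted
export resolves on x86 when both vendor modules are built (macro names taken
from the diff below):

	/* With CONFIG_KVM_AMD=m and CONFIG_KVM_INTEL=m, asm/kvm_types.h
	 * defines KVM_SUB_MODULES as kvm-amd,kvm-intel, so this export:
	 */
	EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_halt);

	/* ...behaves like: */
	EXPORT_SYMBOL_FOR_MODULES(kvm_vcpu_halt, "kvm-amd,kvm-intel");

	/* ...and expands to nothing when KVM_SUB_MODULES is undefined,
	 * i.e. when KVM is built-in or disabled.
	 */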
Link: https://lore.kernel.org/r/20250919003303.1355064-3-seanjc@google.com Cc: Nathan Chancellor Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/powerpc/include/asm/Kbuild | 1 - arch/powerpc/include/asm/kvm_types.h | 15 ++++ arch/x86/include/asm/kvm_types.h | 10 +++ include/linux/kvm_types.h | 25 +++++-- virt/kvm/eventfd.c | 2 +- virt/kvm/guest_memfd.c | 4 +- virt/kvm/kvm_main.c | 128 +++++++++++++++++------------------ 7 files changed, 110 insertions(+), 75 deletions(-) create mode 100644 arch/powerpc/include/asm/kvm_types.h (limited to 'include/linux') diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index e5fdc336c9b2..2e23533b67e3 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -3,7 +3,6 @@ generated-y += syscall_table_32.h generated-y += syscall_table_64.h generated-y += syscall_table_spu.h generic-y += agp.h -generic-y += kvm_types.h generic-y += mcs_spinlock.h generic-y += qrwlock.h generic-y += early_ioremap.h diff --git a/arch/powerpc/include/asm/kvm_types.h b/arch/powerpc/include/asm/kvm_types.h new file mode 100644 index 000000000000..5d4bffea7d47 --- /dev/null +++ b/arch/powerpc/include/asm/kvm_types.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_PPC_KVM_TYPES_H +#define _ASM_PPC_KVM_TYPES_H + +#if IS_MODULE(CONFIG_KVM_BOOK3S_64_PR) && IS_MODULE(CONFIG_KVM_BOOK3S_64_HV) +#define KVM_SUB_MODULES kvm-pr,kvm-hv +#elif IS_MODULE(CONFIG_KVM_BOOK3S_64_PR) +#define KVM_SUB_MODULES kvm-pr +#elif IS_MODULE(CONFIG_KVM_BOOK3S_64_HV) +#define KVM_SUB_MODULES kvm-hv +#else +#undef KVM_SUB_MODULES +#endif + +#endif diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h index 08f1b57d3b62..23268a188e70 100644 --- a/arch/x86/include/asm/kvm_types.h +++ b/arch/x86/include/asm/kvm_types.h @@ -2,6 +2,16 @@ #ifndef _ASM_X86_KVM_TYPES_H #define _ASM_X86_KVM_TYPES_H +#if IS_MODULE(CONFIG_KVM_AMD) && IS_MODULE(CONFIG_KVM_INTEL) +#define KVM_SUB_MODULES kvm-amd,kvm-intel +#elif IS_MODULE(CONFIG_KVM_AMD) +#define KVM_SUB_MODULES kvm-amd +#elif IS_MODULE(CONFIG_KVM_INTEL) +#define KVM_SUB_MODULES kvm-intel +#else +#undef KVM_SUB_MODULES +#endif + #define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40 #endif /* _ASM_X86_KVM_TYPES_H */ diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 827ecc0b7e10..490464c205b4 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -3,6 +3,23 @@ #ifndef __KVM_TYPES_H__ #define __KVM_TYPES_H__ +#include +#include +#include +#include + +#ifdef KVM_SUB_MODULES +#define EXPORT_SYMBOL_FOR_KVM_INTERNAL(symbol) \ + EXPORT_SYMBOL_FOR_MODULES(symbol, __stringify(KVM_SUB_MODULES)) +#else +#define EXPORT_SYMBOL_FOR_KVM_INTERNAL(symbol) +#endif + +#ifndef __ASSEMBLER__ + +#include +#include + struct kvm; struct kvm_async_pf; struct kvm_device_ops; @@ -19,13 +36,6 @@ struct kvm_memslots; enum kvm_mr_change; -#include -#include -#include -#include - -#include - /* * Address types: * @@ -116,5 +126,6 @@ struct kvm_vcpu_stat_generic { }; #define KVM_STATS_NAME_SIZE 48 +#endif /* !__ASSEMBLER__ */ #endif /* __KVM_TYPES_H__ */ diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 6b1133a6617f..a7794ffdb976 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -525,7 +525,7 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin) return false; } -EXPORT_SYMBOL_GPL(kvm_irq_has_notifier); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_irq_has_notifier); void kvm_notify_acked_gsi(struct kvm *kvm, 
int gsi) { diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 1d323ca178cb..94bafd6c558c 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -702,7 +702,7 @@ out: fput(file); return r; } -EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn); #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, @@ -785,5 +785,5 @@ put_folio_and_exit: fput(file); return ret && !i ? ret : i; } -EXPORT_SYMBOL_GPL(kvm_gmem_populate); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate); #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c0c5626ab71d..226faeaa8e56 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -77,22 +77,22 @@ MODULE_LICENSE("GPL"); /* Architectures should define their poll value according to the halt latency */ unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT; module_param(halt_poll_ns, uint, 0644); -EXPORT_SYMBOL_GPL(halt_poll_ns); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns); /* Default doubles per-vcpu halt_poll_ns. */ unsigned int halt_poll_ns_grow = 2; module_param(halt_poll_ns_grow, uint, 0644); -EXPORT_SYMBOL_GPL(halt_poll_ns_grow); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_grow); /* The start value to grow halt_poll_ns from */ unsigned int halt_poll_ns_grow_start = 10000; /* 10us */ module_param(halt_poll_ns_grow_start, uint, 0644); -EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_grow_start); /* Default halves per-vcpu halt_poll_ns. */ unsigned int halt_poll_ns_shrink = 2; module_param(halt_poll_ns_shrink, uint, 0644); -EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(halt_poll_ns_shrink); /* * Allow direct access (from KVM or the CPU) without MMU notifier protection @@ -170,7 +170,7 @@ void vcpu_load(struct kvm_vcpu *vcpu) kvm_arch_vcpu_load(vcpu, cpu); put_cpu(); } -EXPORT_SYMBOL_GPL(vcpu_load); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(vcpu_load); void vcpu_put(struct kvm_vcpu *vcpu) { @@ -180,7 +180,7 @@ void vcpu_put(struct kvm_vcpu *vcpu) __this_cpu_write(kvm_running_vcpu, NULL); preempt_enable(); } -EXPORT_SYMBOL_GPL(vcpu_put); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(vcpu_put); /* TODO: merge with kvm_arch_vcpu_should_kick */ static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) @@ -288,7 +288,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) return called; } -EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_make_all_cpus_request); void kvm_flush_remote_tlbs(struct kvm *kvm) { @@ -309,7 +309,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.generic.remote_tlb_flush; } -EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_flush_remote_tlbs); void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { @@ -499,7 +499,7 @@ void kvm_destroy_vcpus(struct kvm *kvm) atomic_set(&kvm->online_vcpus, 0); } -EXPORT_SYMBOL_GPL(kvm_destroy_vcpus); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_destroy_vcpus); #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) @@ -1365,7 +1365,7 @@ void kvm_put_kvm_no_destroy(struct kvm *kvm) { WARN_ON(refcount_dec_and_test(&kvm->users_count)); } -EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_put_kvm_no_destroy); static int kvm_vm_release(struct inode *inode, struct file *filp) { @@ 
-1397,7 +1397,7 @@ out_unlock: } return -EINTR; } -EXPORT_SYMBOL_GPL(kvm_trylock_all_vcpus); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_trylock_all_vcpus); int kvm_lock_all_vcpus(struct kvm *kvm) { @@ -1422,7 +1422,7 @@ out_unlock: } return r; } -EXPORT_SYMBOL_GPL(kvm_lock_all_vcpus); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lock_all_vcpus); void kvm_unlock_all_vcpus(struct kvm *kvm) { @@ -1434,7 +1434,7 @@ void kvm_unlock_all_vcpus(struct kvm *kvm) kvm_for_each_vcpu(i, vcpu, kvm) mutex_unlock(&vcpu->mutex); } -EXPORT_SYMBOL_GPL(kvm_unlock_all_vcpus); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_unlock_all_vcpus); /* * Allocation size is twice as large as the actual dirty bitmap size. @@ -2142,7 +2142,7 @@ int kvm_set_internal_memslot(struct kvm *kvm, return kvm_set_memory_region(kvm, mem); } -EXPORT_SYMBOL_GPL(kvm_set_internal_memslot); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_set_internal_memslot); static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region2 *mem) @@ -2201,7 +2201,7 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, *is_dirty = 1; return 0; } -EXPORT_SYMBOL_GPL(kvm_get_dirty_log); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_dirty_log); #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ /** @@ -2636,7 +2636,7 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { return __gfn_to_memslot(kvm_memslots(kvm), gfn); } -EXPORT_SYMBOL_GPL(gfn_to_memslot); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_memslot); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) { @@ -2670,7 +2670,7 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn return NULL; } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_gfn_to_memslot); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) { @@ -2678,7 +2678,7 @@ bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) return kvm_is_visible_memslot(memslot); } -EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_is_visible_gfn); bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) { @@ -2686,7 +2686,7 @@ bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) return kvm_is_visible_memslot(memslot); } -EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_is_visible_gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) { @@ -2743,19 +2743,19 @@ unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, { return gfn_to_hva_many(slot, gfn, NULL); } -EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_hva_memslot); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) { return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); } -EXPORT_SYMBOL_GPL(gfn_to_hva); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(gfn_to_hva); unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn) { return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL); } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_gfn_to_hva); /* * Return the hva of a @gfn and the R/W attribute if possible. 
@@ -2819,7 +2819,7 @@ void kvm_release_page_clean(struct page *page) kvm_set_page_accessed(page); put_page(page); } -EXPORT_SYMBOL_GPL(kvm_release_page_clean); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_release_page_clean); void kvm_release_page_dirty(struct page *page) { @@ -2829,7 +2829,7 @@ void kvm_release_page_dirty(struct page *page) kvm_set_page_dirty(page); kvm_release_page_clean(page); } -EXPORT_SYMBOL_GPL(kvm_release_page_dirty); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_release_page_dirty); static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, struct follow_pfnmap_args *map, bool writable) @@ -3073,7 +3073,7 @@ kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, return kvm_follow_pfn(&kfp); } -EXPORT_SYMBOL_GPL(__kvm_faultin_pfn); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_faultin_pfn); int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, struct page **pages, int nr_pages) @@ -3090,7 +3090,7 @@ int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); } -EXPORT_SYMBOL_GPL(kvm_prefetch_pages); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_prefetch_pages); /* * Don't use this API unless you are absolutely, positively certain that KVM @@ -3112,7 +3112,7 @@ struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write) (void)kvm_follow_pfn(&kfp); return refcounted_page; } -EXPORT_SYMBOL_GPL(__gfn_to_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__gfn_to_page); int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, bool writable) @@ -3146,7 +3146,7 @@ int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, return map->hva ? 0 : -EFAULT; } -EXPORT_SYMBOL_GPL(__kvm_vcpu_map); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_map); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map) { @@ -3174,7 +3174,7 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map) map->page = NULL; map->pinned_page = NULL; } -EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_unmap); static int next_segment(unsigned long len, int offset) { @@ -3210,7 +3210,7 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, return __kvm_read_guest_page(slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_read_guest_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_page); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, int len) @@ -3219,7 +3219,7 @@ int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, return __kvm_read_guest_page(slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest_page); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) { @@ -3239,7 +3239,7 @@ int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) } return 0; } -EXPORT_SYMBOL_GPL(kvm_read_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest); int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) { @@ -3259,7 +3259,7 @@ int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned l } return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest); static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, void *data, int offset, unsigned long len) @@ -3290,7 +3290,7 @@ int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t 
gpa, return __kvm_read_guest_atomic(slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_read_guest_atomic); /* Copy @len bytes from @data into guest memory at '(@gfn * PAGE_SIZE) + @offset' */ static int __kvm_write_guest_page(struct kvm *kvm, @@ -3320,7 +3320,7 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_write_guest_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_page); int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data, int offset, int len) @@ -3329,7 +3329,7 @@ int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_write_guest_page); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len) @@ -3350,7 +3350,7 @@ int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, } return 0; } -EXPORT_SYMBOL_GPL(kvm_write_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest); int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, unsigned long len) @@ -3371,7 +3371,7 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, } return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_write_guest); static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, struct gfn_to_hva_cache *ghc, @@ -3420,7 +3420,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, struct kvm_memslots *slots = kvm_memslots(kvm); return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); } -EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gfn_to_hva_cache_init); int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, @@ -3451,14 +3451,14 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, return 0; } -EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_offset_cached); int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len) { return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); } -EXPORT_SYMBOL_GPL(kvm_write_guest_cached); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_write_guest_cached); int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned int offset, @@ -3488,14 +3488,14 @@ int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, return 0; } -EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_offset_cached); int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, void *data, unsigned long len) { return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len); } -EXPORT_SYMBOL_GPL(kvm_read_guest_cached); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_read_guest_cached); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) { @@ -3515,7 +3515,7 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) } return 0; } -EXPORT_SYMBOL_GPL(kvm_clear_guest); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_clear_guest); void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, @@ -3540,7 +3540,7 @@ void mark_page_dirty_in_slot(struct kvm *kvm, 
set_bit_le(rel_gfn, memslot->dirty_bitmap); } } -EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(mark_page_dirty_in_slot); void mark_page_dirty(struct kvm *kvm, gfn_t gfn) { @@ -3549,7 +3549,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn) memslot = gfn_to_memslot(kvm, gfn); mark_page_dirty_in_slot(kvm, memslot, gfn); } -EXPORT_SYMBOL_GPL(mark_page_dirty); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(mark_page_dirty); void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) { @@ -3558,7 +3558,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn); } -EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_mark_page_dirty); void kvm_sigset_activate(struct kvm_vcpu *vcpu) { @@ -3795,7 +3795,7 @@ out: trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu)); } -EXPORT_SYMBOL_GPL(kvm_vcpu_halt); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_halt); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { @@ -3807,7 +3807,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) return false; } -EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_wake_up); #ifndef CONFIG_S390 /* @@ -3859,7 +3859,7 @@ void __kvm_vcpu_kick(struct kvm_vcpu *vcpu, bool wait) out: put_cpu(); } -EXPORT_SYMBOL_GPL(__kvm_vcpu_kick); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_vcpu_kick); #endif /* !CONFIG_S390 */ int kvm_vcpu_yield_to(struct kvm_vcpu *target) @@ -3882,7 +3882,7 @@ int kvm_vcpu_yield_to(struct kvm_vcpu *target) return ret; } -EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_yield_to); /* * Helper that checks whether a VCPU is eligible for directed yield. @@ -4037,7 +4037,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) /* Ensure vcpu is not eligible during next spinloop */ kvm_vcpu_set_dy_eligible(me, false); } -EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_vcpu_on_spin); static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff) { @@ -5019,7 +5019,7 @@ bool kvm_are_all_memslots_empty(struct kvm *kvm) return true; } -EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_are_all_memslots_empty); static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, struct kvm_enable_cap *cap) @@ -5474,7 +5474,7 @@ bool file_is_kvm(struct file *file) { return file && file->f_op == &kvm_vm_fops; } -EXPORT_SYMBOL_GPL(file_is_kvm); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(file_is_kvm); static int kvm_dev_ioctl_create_vm(unsigned long type) { @@ -5569,10 +5569,10 @@ static struct miscdevice kvm_dev = { #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING bool enable_virt_at_load = true; module_param(enable_virt_at_load, bool, 0444); -EXPORT_SYMBOL_GPL(enable_virt_at_load); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_virt_at_load); __visible bool kvm_rebooting; -EXPORT_SYMBOL_GPL(kvm_rebooting); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting); static DEFINE_PER_CPU(bool, virtualization_enabled); static DEFINE_MUTEX(kvm_usage_lock); @@ -5723,7 +5723,7 @@ err_cpuhp: --kvm_usage_count; return r; } -EXPORT_SYMBOL_GPL(kvm_enable_virtualization); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_virtualization); void kvm_disable_virtualization(void) { @@ -5736,7 +5736,7 @@ void kvm_disable_virtualization(void) cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); kvm_arch_disable_virtualization(); } -EXPORT_SYMBOL_GPL(kvm_disable_virtualization); 
+EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_disable_virtualization); static int kvm_init_virtualization(void) { @@ -5885,7 +5885,7 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, r = __kvm_io_bus_write(vcpu, bus, &range, val); return r < 0 ? r : 0; } -EXPORT_SYMBOL_GPL(kvm_io_bus_write); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_write); int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val, long cookie) @@ -5954,7 +5954,7 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, r = __kvm_io_bus_read(vcpu, bus, &range, val); return r < 0 ? r : 0; } -EXPORT_SYMBOL_GPL(kvm_io_bus_read); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_read); static void __free_bus(struct rcu_head *rcu) { @@ -6078,7 +6078,7 @@ out_unlock: return iodev; } -EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_io_bus_get_dev); static int kvm_debugfs_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), @@ -6415,7 +6415,7 @@ struct kvm_vcpu *kvm_get_running_vcpu(void) return vcpu; } -EXPORT_SYMBOL_GPL(kvm_get_running_vcpu); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_get_running_vcpu); /** * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus. @@ -6550,7 +6550,7 @@ err_cpu_kick_mask: kmem_cache_destroy(kvm_vcpu_cache); return r; } -EXPORT_SYMBOL_GPL(kvm_init); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init); void kvm_exit(void) { @@ -6573,4 +6573,4 @@ void kvm_exit(void) kvm_async_pf_deinit(); kvm_irqfd_exit(); } -EXPORT_SYMBOL_GPL(kvm_exit); +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_exit); -- cgit v1.2.3 From 25ba2b84c38f624151a3ba36e56d41c39b9223ad Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 19 Sep 2025 10:36:26 -0400 Subject: nfs/localio: avoid issuing misaligned IO using O_DIRECT Add nfsd_file_dio_alignment and use it to avoid issuing misaligned IO using O_DIRECT. Any misaligned DIO falls back to using buffered IO. Because misaligned DIO is now handled safely, remove the nfs modparam 'localio_O_DIRECT_semantics' that was added to require users to opt in to the requirement that all O_DIRECT be properly DIO-aligned. Also, introduce nfs_iov_iter_aligned_bvec() which is a variant of iov_iter_aligned_bvec() that also verifies the offset associated with an iov_iter is DIO-aligned. NOTE: in a parallel effort, iov_iter_aligned_bvec() is being removed along with iov_iter_is_aligned(). Lastly, add a pr_info_ratelimited() message if the underlying filesystem returns -EINVAL because it was made to try O_DIRECT for IO that is not DIO-aligned (shouldn't happen, so it's best to be louder if it does).
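The heart of the change is a pure alignment predicate; a simplified sketch of
the rule that the new nfs_iov_iter_aligned_bvec() check enforces (the
alignment values reported by nfsd are powers of two, so subtracting one
yields a bitmask):

	/* Simplified: attempt O_DIRECT only when the file offset, the
	 * total length, and the in-memory buffer offset all satisfy the
	 * filesystem-reported alignment masks; otherwise fall back to
	 * buffered IO.
	 */
	static bool dio_aligned(loff_t offset, size_t len,
				unsigned long mem_off,
				u32 offset_align, u32 mem_align)
	{
		if ((offset | len) & (offset_align - 1))
			return false;
		if (mem_off & (mem_align - 1))
			return false;
		return true;
	}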
Fixes: 3feec68563d ("nfs/localio: add direct IO enablement with sync and async IO support") Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs/localio.c | 65 +++++++++++++++++++++++++++++++++++++++------- fs/nfsd/localio.c | 11 ++++++++ include/linux/nfslocalio.h | 2 ++ 3 files changed, 68 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 42ea50d42c99..b165922e5cb6 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -49,11 +49,6 @@ struct nfs_local_fsync_ctx { static bool localio_enabled __read_mostly = true; module_param(localio_enabled, bool, 0644); -static bool localio_O_DIRECT_semantics __read_mostly = false; -module_param(localio_O_DIRECT_semantics, bool, 0644); -MODULE_PARM_DESC(localio_O_DIRECT_semantics, - "LOCALIO will use O_DIRECT semantics to filesystem."); - static inline bool nfs_client_is_local(const struct nfs_client *clp) { return !!rcu_access_pointer(clp->cl_uuid.net); @@ -322,12 +317,9 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, return NULL; } - if (localio_O_DIRECT_semantics && - test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { - iocb->kiocb.ki_filp = file; + init_sync_kiocb(&iocb->kiocb, file); + if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) iocb->kiocb.ki_flags = IOCB_DIRECT; - } else - init_sync_kiocb(&iocb->kiocb, file); iocb->kiocb.ki_pos = hdr->args.offset; iocb->hdr = hdr; @@ -337,6 +329,30 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, return iocb; } +static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i, + loff_t offset, unsigned int addr_mask, unsigned int len_mask) +{ + const struct bio_vec *bvec = i->bvec; + size_t skip = i->iov_offset; + size_t size = i->count; + + if ((offset | size) & len_mask) + return false; + do { + size_t len = bvec->bv_len; + + if (len > size) + len = size; + if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) + return false; + bvec++; + size -= len; + skip = 0; + } while (size); + + return true; +} + static void nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir) { @@ -346,6 +362,25 @@ nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir) hdr->args.count + hdr->args.pgbase); if (hdr->args.pgbase != 0) iov_iter_advance(i, hdr->args.pgbase); + + if (iocb->kiocb.ki_flags & IOCB_DIRECT) { + u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align; + /* Verify the IO is DIO-aligned as required */ + nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align, + &nf_dio_offset_align, + &nf_dio_read_offset_align); + if (dir == READ) + nf_dio_offset_align = nf_dio_read_offset_align; + + if (nf_dio_mem_align && nf_dio_offset_align && + nfs_iov_iter_aligned_bvec(i, hdr->args.offset, + nf_dio_mem_align - 1, + nf_dio_offset_align - 1)) + return; /* is DIO-aligned */ + + /* Fallback to using buffered for this misaligned IO */ + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; + } } static void @@ -406,6 +441,11 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) struct nfs_pgio_header *hdr = iocb->hdr; struct file *filp = iocb->kiocb.ki_filp; + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ + pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n"); + } + nfs_local_pgio_done(hdr, status); /* @@ -598,6 +638,11 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? 
status : 0); + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ + pr_info_ratelimited("nfs: Unexpected direct I/O write alignment failure\n"); + } + /* Handle short writes as if they are ENOSPC */ if (status > 0 && status < hdr->args.count) { hdr->mds_offset += status; diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index cb237f1b902a..9e0a37cd29d8 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -117,6 +117,16 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom, return localio; } +static void nfsd_file_dio_alignment(struct nfsd_file *nf, + u32 *nf_dio_mem_align, + u32 *nf_dio_offset_align, + u32 *nf_dio_read_offset_align) +{ + *nf_dio_mem_align = nf->nf_dio_mem_align; + *nf_dio_offset_align = nf->nf_dio_offset_align; + *nf_dio_read_offset_align = nf->nf_dio_read_offset_align; +} + static const struct nfsd_localio_operations nfsd_localio_ops = { .nfsd_net_try_get = nfsd_net_try_get, .nfsd_net_put = nfsd_net_put, @@ -124,6 +134,7 @@ static const struct nfsd_localio_operations nfsd_localio_ops = { .nfsd_file_put_local = nfsd_file_put_local, .nfsd_file_get_local = nfsd_file_get_local, .nfsd_file_file = nfsd_file_file, + .nfsd_file_dio_alignment = nfsd_file_dio_alignment, }; void nfsd_localio_ops_init(void) diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 5c7c92659e73..7ca2715edccc 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -65,6 +65,8 @@ struct nfsd_localio_operations { struct net *(*nfsd_file_put_local)(struct nfsd_file __rcu **); struct nfsd_file *(*nfsd_file_get_local)(struct nfsd_file *); struct file *(*nfsd_file_file)(struct nfsd_file *); + void (*nfsd_file_dio_alignment)(struct nfsd_file *, + u32 *, u32 *, u32 *); } ____cacheline_aligned; extern void nfsd_localio_ops_init(void); -- cgit v1.2.3 From 6d0386ea99875313fdfd074eb74013b6e3b48a76 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Aug 2025 17:01:53 -0700 Subject: entry/kvm: KVM: Move KVM details related to signal/-EINTR into KVM proper Move KVM's morphing of pending signals into userspace exits into KVM proper, and drop the @vcpu param from xfer_to_guest_mode_handle_work(). How KVM responds to -EINTR is a detail that really belongs in KVM itself, and invoking kvm_handle_signal_exit() from kernel code creates an inverted module dependency. E.g. attempting to move kvm_handle_signal_exit() into kvm_main.c would generate a linker error when building kvm.ko as a module. Dropping KVM details will also help convert the KVM "entry" code into a more generic virtualization framework so that it can be used when running as a Hyper-V root partition. Lastly, eliminating usage of "struct kvm_vcpu" outside of KVM is also nice to have for KVM x86 developers, as keeping the details of kvm_vcpu purely within KVM allows changing the layout of the structure without having to boot into a new kernel, e.g. allows rebuilding and reloading kvm.ko with a modified kvm_vcpu structure as part of debug/development.
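After the move, architectures call KVM's wrapper instead of passing the vcpu
to the generic helper; a sketch of the resulting run-loop pattern (mirroring
the arm64/loongarch/riscv hunks below):

	/* In kvm_arch_vcpu_ioctl_run(), before entering the guest: */
	r = kvm_xfer_to_guest_mode_handle_work(vcpu);
	if (r) {
		/* -EINTR: the wrapper already set run->exit_reason to
		 * KVM_EXIT_INTR and bumped the signal_exits stat.
		 */
		return r;
	}
	/* no pending work; proceed into the guest */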
Signed-off-by: Sean Christopherson Reviewed-by: Thomas Gleixner Signed-off-by: Wei Liu --- arch/arm64/kvm/arm.c | 3 +-- arch/loongarch/kvm/vcpu.c | 3 +-- arch/riscv/kvm/vcpu.c | 3 +-- arch/x86/kvm/vmx/vmx.c | 1 - arch/x86/kvm/x86.c | 3 +-- include/linux/entry-kvm.h | 11 +++-------- include/linux/kvm_host.h | 13 ++++++++++++- kernel/entry/kvm.c | 13 +++++-------- 8 files changed, 24 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 888f7c7abf54..418fd3043467 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -6,7 +6,6 @@ #include #include -#include #include #include #include @@ -1177,7 +1176,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* * Check conditions before entering the guest */ - ret = xfer_to_guest_mode_handle_work(vcpu); + ret = kvm_xfer_to_guest_mode_handle_work(vcpu); if (!ret) ret = 1; diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index d1b8c50941ca..514256b25ba1 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -4,7 +4,6 @@ */ #include -#include #include #include #include @@ -251,7 +250,7 @@ static int kvm_enter_guest_check(struct kvm_vcpu *vcpu) /* * Check conditions before entering the guest */ - ret = xfer_to_guest_mode_handle_work(vcpu); + ret = kvm_xfer_to_guest_mode_handle_work(vcpu); if (ret < 0) return ret; diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index f001e56403f9..251e787f2ebc 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -7,7 +7,6 @@ */ #include -#include #include #include #include @@ -910,7 +909,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) run->exit_reason = KVM_EXIT_UNKNOWN; while (ret > 0) { /* Check conditions before entering the guest */ - ret = xfer_to_guest_mode_handle_work(vcpu); + ret = kvm_xfer_to_guest_mode_handle_work(vcpu); if (ret) continue; ret = 1; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aa157fe5b7b3..d7c86613e50a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a1c49bc681c4..0b13b8bf69e5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include @@ -11241,7 +11240,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (__xfer_to_guest_mode_work_pending()) { kvm_vcpu_srcu_read_unlock(vcpu); - r = xfer_to_guest_mode_handle_work(vcpu); + r = kvm_xfer_to_guest_mode_handle_work(vcpu); kvm_vcpu_srcu_read_lock(vcpu); if (r) return r; diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h index 16149f6625e4..3644de7e6019 100644 --- a/include/linux/entry-kvm.h +++ b/include/linux/entry-kvm.h @@ -21,8 +21,6 @@ _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME | \ ARCH_XFER_TO_GUEST_MODE_WORK) -struct kvm_vcpu; - /** * arch_xfer_to_guest_mode_handle_work - Architecture specific xfer to guest * mode work handling function. @@ -32,12 +30,10 @@ struct kvm_vcpu; * Invoked from xfer_to_guest_mode_handle_work(). Defaults to NOOP. Can be * replaced by architecture specific code. 
*/ -static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu, - unsigned long ti_work); +static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work); #ifndef arch_xfer_to_guest_mode_work -static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu, - unsigned long ti_work) +static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work) { return 0; } @@ -46,11 +42,10 @@ static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu, /** * xfer_to_guest_mode_handle_work - Check and handle pending work which needs * to be handled before going to guest mode - * @vcpu: Pointer to current's VCPU data * * Returns: 0 or an error code */ -int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu); +int xfer_to_guest_mode_handle_work(void); /** * xfer_to_guest_mode_prepare - Perform last minute preparation work that diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 15656b7fba6c..598b9473e46d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2,7 +2,7 @@ #ifndef __KVM_HOST_H #define __KVM_HOST_H - +#include #include #include #include @@ -2450,6 +2450,17 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_INTR; vcpu->stat.signal_exits++; } + +static inline int kvm_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) +{ + int r = xfer_to_guest_mode_handle_work(); + + if (r) { + WARN_ON_ONCE(r != -EINTR); + kvm_handle_signal_exit(vcpu); + } + return r; +} #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ /* diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 8485f63863af..6fc762eaacca 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -1,17 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include -static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) +static int xfer_to_guest_mode_work(unsigned long ti_work) { do { int ret; - if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) { - kvm_handle_signal_exit(vcpu); + if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) return -EINTR; - } if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) schedule(); @@ -19,7 +16,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) if (ti_work & _TIF_NOTIFY_RESUME) resume_user_mode_work(NULL); - ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); + ret = arch_xfer_to_guest_mode_handle_work(ti_work); if (ret) return ret; @@ -28,7 +25,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) return 0; } -int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) +int xfer_to_guest_mode_handle_work(void) { unsigned long ti_work; @@ -44,6 +41,6 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) return 0; - return xfer_to_guest_mode_work(vcpu, ti_work); + return xfer_to_guest_mode_work(ti_work); } EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work); -- cgit v1.2.3 From 9be7e1e320ff2e7db4b23c8ec5f599bbfac94ede Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 27 Aug 2025 17:01:54 -0700 Subject: entry: Rename "kvm" entry code assets to "virt" to genericize APIs Rename the "kvm" entry code files and Kconfigs to use generic "virt" nomenclature so that the code can be reused by other hypervisors (or rather, their root/dom0 partition drivers), without incorrectly suggesting the code somehow relies on and/or involves KVM. No functional change intended. 
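To show the intended reuse, a sketch of a hypothetical non-KVM hypervisor
driver (e.g. a Hyper-V root-partition run loop) picking up the generic entry
helpers after the rename, assuming its Kconfig selects
VIRT_XFER_TO_GUEST_WORK:

	#include <linux/entry-virt.h>	/* was <linux/entry-kvm.h> */

	static int root_partition_enter_guest(void)
	{
		int ret;

		/* handle reschedule/signal/notify work first */
		ret = xfer_to_guest_mode_handle_work();
		if (ret)
			return ret;	/* e.g. -EINTR on a pending signal */

		/* irqs off, final xfer_to_guest_mode_work_pending()
		 * check, then enter the guest...
		 */
		return 0;
	}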
Signed-off-by: Sean Christopherson Reviewed-by: Thomas Gleixner Reviewed-by: Joel Fernandes Signed-off-by: Wei Liu --- MAINTAINERS | 2 +- arch/arm64/kvm/Kconfig | 2 +- arch/loongarch/kvm/Kconfig | 2 +- arch/riscv/kvm/Kconfig | 2 +- arch/x86/kvm/Kconfig | 2 +- include/linux/entry-kvm.h | 95 ---------------------------------------------- include/linux/entry-virt.h | 95 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/kvm_host.h | 6 +-- include/linux/rcupdate.h | 2 +- kernel/entry/Makefile | 2 +- kernel/entry/kvm.c | 46 ---------------------- kernel/entry/virt.c | 46 ++++++++++++++++++++++ kernel/rcu/tree.c | 6 +-- virt/kvm/Kconfig | 2 +- 14 files changed, 155 insertions(+), 155 deletions(-) delete mode 100644 include/linux/entry-kvm.h create mode 100644 include/linux/entry-virt.h delete mode 100644 kernel/entry/kvm.c create mode 100644 kernel/entry/virt.c (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index fe168477caa4..c255048333f0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10200,7 +10200,7 @@ L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core/entry F: include/linux/entry-common.h -F: include/linux/entry-kvm.h +F: include/linux/entry-virt.h F: include/linux/irq-entry-common.h F: kernel/entry/ diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 713248f240e0..6f4fc3caa31a 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -25,7 +25,7 @@ menuconfig KVM select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO select KVM_GENERIC_DIRTYLOG_READ_PROTECT - select KVM_XFER_TO_GUEST_WORK + select VIRT_XFER_TO_GUEST_WORK select KVM_VFIO select HAVE_KVM_DIRTY_RING_ACQ_REL select NEED_KVM_DIRTY_RING_WITH_BITMAP diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig index 40eea6da7c25..ae64bbdf83a7 100644 --- a/arch/loongarch/kvm/Kconfig +++ b/arch/loongarch/kvm/Kconfig @@ -31,7 +31,7 @@ config KVM select KVM_GENERIC_HARDWARE_ENABLING select KVM_GENERIC_MMU_NOTIFIER select KVM_MMIO - select KVM_XFER_TO_GUEST_WORK + select VIRT_XFER_TO_GUEST_WORK select SCHED_INFO select GUEST_PERF_EVENTS if PERF_EVENTS help diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index 5a62091b0809..c50328212917 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -30,7 +30,7 @@ config KVM select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING select KVM_MMIO - select KVM_XFER_TO_GUEST_WORK + select VIRT_XFER_TO_GUEST_WORK select KVM_GENERIC_MMU_NOTIFIER select SCHED_INFO select GUEST_PERF_EVENTS if PERF_EVENTS diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2c86673155c9..f81074b0c0a8 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -40,7 +40,7 @@ config KVM_X86 select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_NO_POLL - select KVM_XFER_TO_GUEST_WORK + select VIRT_XFER_TO_GUEST_WORK select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO select HAVE_KVM_PM_NOTIFIER if PM diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h deleted file mode 100644 index 3644de7e6019..000000000000 --- a/include/linux/entry-kvm.h +++ /dev/null @@ -1,95 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_ENTRYKVM_H -#define __LINUX_ENTRYKVM_H - -#include -#include -#include -#include -#include -#include - -/* Transfer to guest mode work */ -#ifdef CONFIG_KVM_XFER_TO_GUEST_WORK - -#ifndef ARCH_XFER_TO_GUEST_MODE_WORK -# define ARCH_XFER_TO_GUEST_MODE_WORK (0) -#endif - -#define 
XFER_TO_GUEST_MODE_WORK \ - (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_SIGPENDING | \ - _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME | \ - ARCH_XFER_TO_GUEST_MODE_WORK) - -/** - * arch_xfer_to_guest_mode_handle_work - Architecture specific xfer to guest - * mode work handling function. - * @vcpu: Pointer to current's VCPU data - * @ti_work: Cached TIF flags gathered in xfer_to_guest_mode_handle_work() - * - * Invoked from xfer_to_guest_mode_handle_work(). Defaults to NOOP. Can be - * replaced by architecture specific code. - */ -static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work); - -#ifndef arch_xfer_to_guest_mode_work -static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work) -{ - return 0; -} -#endif - -/** - * xfer_to_guest_mode_handle_work - Check and handle pending work which needs - * to be handled before going to guest mode - * - * Returns: 0 or an error code - */ -int xfer_to_guest_mode_handle_work(void); - -/** - * xfer_to_guest_mode_prepare - Perform last minute preparation work that - * need to be handled while IRQs are disabled - * upon entering to guest. - * - * Has to be invoked with interrupts disabled before the last call - * to xfer_to_guest_mode_work_pending(). - */ -static inline void xfer_to_guest_mode_prepare(void) -{ - lockdep_assert_irqs_disabled(); - tick_nohz_user_enter_prepare(); -} - -/** - * __xfer_to_guest_mode_work_pending - Check if work is pending - * - * Returns: True if work pending, False otherwise. - * - * Bare variant of xfer_to_guest_mode_work_pending(). Can be called from - * interrupt enabled code for racy quick checks with care. - */ -static inline bool __xfer_to_guest_mode_work_pending(void) -{ - unsigned long ti_work = read_thread_flags(); - - return !!(ti_work & XFER_TO_GUEST_MODE_WORK); -} - -/** - * xfer_to_guest_mode_work_pending - Check if work is pending which needs to be - * handled before returning to guest mode - * - * Returns: True if work pending, False otherwise. - * - * Has to be invoked with interrupts disabled before the transition to - * guest mode. - */ -static inline bool xfer_to_guest_mode_work_pending(void) -{ - lockdep_assert_irqs_disabled(); - return __xfer_to_guest_mode_work_pending(); -} -#endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ - -#endif diff --git a/include/linux/entry-virt.h b/include/linux/entry-virt.h new file mode 100644 index 000000000000..42c89e3e5ca7 --- /dev/null +++ b/include/linux/entry-virt.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_ENTRYVIRT_H +#define __LINUX_ENTRYVIRT_H + +#include +#include +#include +#include +#include +#include + +/* Transfer to guest mode work */ +#ifdef CONFIG_VIRT_XFER_TO_GUEST_WORK + +#ifndef ARCH_XFER_TO_GUEST_MODE_WORK +# define ARCH_XFER_TO_GUEST_MODE_WORK (0) +#endif + +#define XFER_TO_GUEST_MODE_WORK \ + (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_SIGPENDING | \ + _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME | \ + ARCH_XFER_TO_GUEST_MODE_WORK) + +/** + * arch_xfer_to_guest_mode_handle_work - Architecture specific xfer to guest + * mode work handling function. + * @vcpu: Pointer to current's VCPU data + * @ti_work: Cached TIF flags gathered in xfer_to_guest_mode_handle_work() + * + * Invoked from xfer_to_guest_mode_handle_work(). Defaults to NOOP. Can be + * replaced by architecture specific code. 
+ */ +static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work); + +#ifndef arch_xfer_to_guest_mode_work +static inline int arch_xfer_to_guest_mode_handle_work(unsigned long ti_work) +{ + return 0; +} +#endif + +/** + * xfer_to_guest_mode_handle_work - Check and handle pending work which needs + * to be handled before going to guest mode + * + * Returns: 0 or an error code + */ +int xfer_to_guest_mode_handle_work(void); + +/** + * xfer_to_guest_mode_prepare - Perform last minute preparation work that + * need to be handled while IRQs are disabled + * upon entering to guest. + * + * Has to be invoked with interrupts disabled before the last call + * to xfer_to_guest_mode_work_pending(). + */ +static inline void xfer_to_guest_mode_prepare(void) +{ + lockdep_assert_irqs_disabled(); + tick_nohz_user_enter_prepare(); +} + +/** + * __xfer_to_guest_mode_work_pending - Check if work is pending + * + * Returns: True if work pending, False otherwise. + * + * Bare variant of xfer_to_guest_mode_work_pending(). Can be called from + * interrupt enabled code for racy quick checks with care. + */ +static inline bool __xfer_to_guest_mode_work_pending(void) +{ + unsigned long ti_work = read_thread_flags(); + + return !!(ti_work & XFER_TO_GUEST_MODE_WORK); +} + +/** + * xfer_to_guest_mode_work_pending - Check if work is pending which needs to be + * handled before returning to guest mode + * + * Returns: True if work pending, False otherwise. + * + * Has to be invoked with interrupts disabled before the transition to + * guest mode. + */ +static inline bool xfer_to_guest_mode_work_pending(void) +{ + lockdep_assert_irqs_disabled(); + return __xfer_to_guest_mode_work_pending(); +} +#endif /* CONFIG_VIRT_XFER_TO_GUEST_WORK */ + +#endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 598b9473e46d..70ac2267d5d0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2,7 +2,7 @@ #ifndef __KVM_HOST_H #define __KVM_HOST_H -#include +#include #include #include #include @@ -2444,7 +2444,7 @@ static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */ -#ifdef CONFIG_KVM_XFER_TO_GUEST_WORK +#ifdef CONFIG_VIRT_XFER_TO_GUEST_WORK static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) { vcpu->run->exit_reason = KVM_EXIT_INTR; @@ -2461,7 +2461,7 @@ static inline int kvm_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) } return r; } -#endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */ +#endif /* CONFIG_VIRT_XFER_TO_GUEST_WORK */ /* * If more than one page is being (un)accounted, @virt must be the address of diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 120536f4c6eb..1e1f3aa375d9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -129,7 +129,7 @@ static inline void rcu_sysrq_start(void) { } static inline void rcu_sysrq_end(void) { } #endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */ -#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) +#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) void rcu_irq_work_resched(void); #else static __always_inline void rcu_irq_work_resched(void) { } diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index 77fcd83dd663..2333d70802e4 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -14,4 +14,4 @@ CFLAGS_common.o += -fno-stack-protector obj-$(CONFIG_GENERIC_IRQ_ENTRY) += common.o 
obj-$(CONFIG_GENERIC_SYSCALL) += syscall-common.o syscall_user_dispatch.o -obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o +obj-$(CONFIG_VIRT_XFER_TO_GUEST_WORK) += virt.o diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c deleted file mode 100644 index 6fc762eaacca..000000000000 --- a/kernel/entry/kvm.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include - -static int xfer_to_guest_mode_work(unsigned long ti_work) -{ - do { - int ret; - - if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) - return -EINTR; - - if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) - schedule(); - - if (ti_work & _TIF_NOTIFY_RESUME) - resume_user_mode_work(NULL); - - ret = arch_xfer_to_guest_mode_handle_work(ti_work); - if (ret) - return ret; - - ti_work = read_thread_flags(); - } while (ti_work & XFER_TO_GUEST_MODE_WORK); - return 0; -} - -int xfer_to_guest_mode_handle_work(void) -{ - unsigned long ti_work; - - /* - * This is invoked from the outer guest loop with interrupts and - * preemption enabled. - * - * KVM invokes xfer_to_guest_mode_work_pending() with interrupts - * disabled in the inner loop before going into guest mode. No need - * to disable interrupts here. - */ - ti_work = read_thread_flags(); - if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) - return 0; - - return xfer_to_guest_mode_work(ti_work); -} -EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work); diff --git a/kernel/entry/virt.c b/kernel/entry/virt.c new file mode 100644 index 000000000000..c52f99249763 --- /dev/null +++ b/kernel/entry/virt.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +static int xfer_to_guest_mode_work(unsigned long ti_work) +{ + do { + int ret; + + if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) + return -EINTR; + + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) + schedule(); + + if (ti_work & _TIF_NOTIFY_RESUME) + resume_user_mode_work(NULL); + + ret = arch_xfer_to_guest_mode_handle_work(ti_work); + if (ret) + return ret; + + ti_work = read_thread_flags(); + } while (ti_work & XFER_TO_GUEST_MODE_WORK); + return 0; +} + +int xfer_to_guest_mode_handle_work(void) +{ + unsigned long ti_work; + + /* + * This is invoked from the outer guest loop with interrupts and + * preemption enabled. + * + * KVM invokes xfer_to_guest_mode_work_pending() with interrupts + * disabled in the inner loop before going into guest mode. No need + * to disable interrupts here. + */ + ti_work = read_thread_flags(); + if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) + return 0; + + return xfer_to_guest_mode_work(ti_work); +} +EXPORT_SYMBOL_GPL(xfer_to_guest_mode_handle_work); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 174ee243b349..995489b72535 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -573,7 +573,7 @@ void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len) } EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs); -#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) +#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) /* * An empty function that will trigger a reschedule on * IRQ tail once IRQs get re-enabled on userspace/guest resume. 
@@ -602,7 +602,7 @@ noinstr void rcu_irq_work_resched(void) if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU)) return; - if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU)) + if (IS_ENABLED(CONFIG_VIRT_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU)) return; instrumentation_begin(); @@ -611,7 +611,7 @@ noinstr void rcu_irq_work_resched(void) } instrumentation_end(); } -#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) */ +#endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_VIRT_XFER_TO_GUEST_WORK)) */ #ifdef CONFIG_PROVE_RCU /** diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 727b542074e7..ce843db53831 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -87,7 +87,7 @@ config HAVE_KVM_VCPU_RUN_PID_CHANGE config HAVE_KVM_NO_POLL bool -config KVM_XFER_TO_GUEST_WORK +config VIRT_XFER_TO_GUEST_WORK bool config HAVE_KVM_PM_NOTIFIER -- cgit v1.2.3 From 1a98f5699bd57c9b3f66ec54cc38571d5e42ffb1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 30 Sep 2025 15:45:06 +0200 Subject: Revert "Documentation: net: add flow control guide and document ethtool API" This reverts commit 7bd80ed89d72285515db673803b021469ba71ee8. I should not have merged it to begin with due to pending review and changes to be addressed. Link: https://patch.msgid.link/c6f3af12df9b7998920a02027fc8893ce82afc4c.1759239721.git.pabeni@redhat.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ethtool.yaml | 27 -- Documentation/networking/flow_control.rst | 373 ------------------------- Documentation/networking/index.rst | 1 - Documentation/networking/phy.rst | 12 +- include/linux/ethtool.h | 45 +-- include/uapi/linux/ethtool_netlink_generated.h | 4 +- net/dcb/dcbnl.c | 2 - net/ethtool/pause.c | 4 - 8 files changed, 15 insertions(+), 453 deletions(-) delete mode 100644 Documentation/networking/flow_control.rst (limited to 'include/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index e4852505294f..6a0fb1974513 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -864,9 +864,7 @@ attribute-sets: - name: pause-stat - doc: Statistics counters for link-wide PAUSE frames (IEEE 802.3 Annex 31B). attr-cnt-name: __ethtool-a-pause-stat-cnt - enum-name: ethtool-a-pause-stat attributes: - name: unspec @@ -877,17 +875,13 @@ attribute-sets: type: pad - name: tx-frames - doc: Number of PAUSE frames transmitted. type: u64 - name: rx-frames - doc: Number of PAUSE frames received. type: u64 - name: pause - doc: Parameters for link-wide PAUSE (IEEE 802.3 Annex 31B). attr-cnt-name: __ethtool-a-pause-cnt - enum-name: ethtool-a-pause attributes: - name: unspec @@ -899,40 +893,19 @@ attribute-sets: nested-attributes: header - name: autoneg - doc: | - Acts as a mode selector for the driver. - On GET: indicates the driver's behavior. If true, the driver will - respect the negotiated outcome; if false, the driver will use a - forced configuration. - On SET: if true, the driver configures the PHY's advertisement based - on the rx and tx attributes. If false, the driver forces the MAC - into the state defined by the rx and tx attributes. type: u8 - name: rx - doc: | - Enable receiving PAUSE frames (pausing local TX). - On GET: reflects the currently preferred configuration state. type: u8 - name: tx - doc: | - Enable transmitting PAUSE frames (pausing peer TX). 
- On GET: reflects the currently preferred configuration state. type: u8 - name: stats - doc: | - Contains the pause statistics counters. The source of these - statistics is determined by stats-src. type: nest nested-attributes: pause-stat - name: stats-src - doc: | - Selects the source of the MAC statistics, values from - enum ethtool_mac_stats_src. This allows requesting statistics - from the individual components of the MAC Merge layer. type: u32 - name: eee diff --git a/Documentation/networking/flow_control.rst b/Documentation/networking/flow_control.rst deleted file mode 100644 index 48646d54513f..000000000000 --- a/Documentation/networking/flow_control.rst +++ /dev/null @@ -1,373 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -.. _ethernet-flow-control: - -===================== -Ethernet Flow Control -===================== - -This document is a practical guide to Ethernet Flow Control in Linux, covering -what it is, how it works, and how to configure it. - -What is Flow Control? -===================== - -Flow control is a mechanism to prevent a fast sender from overwhelming a -slow receiver with data, which would cause buffer overruns and dropped packets. -The receiver can signal the sender to temporarily stop transmitting, giving it -time to process its backlog. - -Standards references -==================== - -Ethernet flow control mechanisms are specified across consolidated IEEE base -standards; some originated as amendments: - -- Collision-based flow control is part of CSMA/CD in **IEEE 802.3** - (half-duplex). -- Link-wide PAUSE is defined in **IEEE 802.3 Annex 31B** - (originally **802.3x**). -- Priority-based Flow Control (PFC) is defined in **IEEE 802.1Q Clause 36** - (originally **802.1Qbb**). - -In the remainder of this document, the consolidated clause numbers are used. - -How It Works: The Mechanisms -============================ - -The method used for flow control depends on the link's duplex mode. - -.. note:: - The user-visible ``ethtool`` pause API described in this document controls - **link-wide PAUSE** (IEEE 802.3 Annex 31B) only. It does not control the - collision-based behavior that exists on half-duplex links. - -1. Half-Duplex: Collision-Based Flow Control --------------------------------------------- -On half-duplex links, a device cannot send and receive simultaneously, so PAUSE -frames are not used. Flow control is achieved by leveraging the CSMA/CD -(Carrier Sense Multiple Access with Collision Detection) protocol itself. - -* **How it works**: To inhibit incoming data, a receiving device can force a - collision on the line. When the sending station detects this collision, it - terminates its transmission, sends a "jam" signal, and then executes the - "Collision backoff and retransmission" procedure as defined in IEEE 802.3, - Section 4.2.3.2.5. This algorithm makes the sender wait for a random - period before attempting to retransmit. By repeatedly forcing collisions, - the receiver can effectively throttle the sender's transmission rate. - -.. note:: - While this mechanism is part of the IEEE standard, there is currently no - generic kernel API to configure or control it. Drivers should not enable - this feature until a standardized interface is available. - -.. warning:: - On shared-medium networks (e.g. 10BASE2, or twisted-pair networks using a - hub rather than a switch) forcing collisions inhibits traffic **across the - entire shared segment**, not just a single point-to-point link. Enabling - such behavior is generally undesirable. - -2. 
Full-Duplex: Link-wide PAUSE (IEEE 802.3 Annex 31B) ------------------------------------------------------- -On full-duplex links, devices can send and receive at the same time. Flow -control is achieved by sending a special **PAUSE frame**, defined by IEEE -802.3 Annex 31B. This mechanism pauses all traffic on the link and is therefore -called *link-wide PAUSE*. - -* **What it is**: A standard Ethernet frame with a globally reserved - destination MAC address (``01-80-C2-00-00-01``). This address is in a range - that standard IEEE 802.1D-compliant bridges do not forward. However, some - unmanaged or misconfigured bridges have been reported to forward these - frames, which can disrupt flow control across a network. - -* **How it works**: The frame contains a MAC Control opcode for PAUSE - (``0x0001``) and a ``pause_time`` value, telling the sender how long to - wait before sending more data frames. This time is specified in units of - "pause quantum", where one quantum is the time it takes to transmit 512 bits. - For example, one pause quantum is 51.2 microseconds on a 10 Mbit/s link, - and 512 nanoseconds on a 1 Gbit/s link. A ``pause_time`` of zero indicates - that the transmitter can resume transmission, even if a previous non-zero - pause time has not yet elapsed. - -* **Who uses it**: Any full-duplex link, from 10 Mbit/s to multi-gigabit speeds. - -3. Full-Duplex: Priority-based Flow Control (PFC) (IEEE 802.1Q Clause 36) -------------------------------------------------------------------------- -Priority-based Flow Control is an enhancement to the standard PAUSE mechanism -that allows flow control to be applied independently to different classes of -traffic, identified by their priority level. - -* **What it is**: PFC allows a receiver to pause traffic for one or more of the - 8 standard priority levels without stopping traffic for other priorities. - This is critical in data center environments for protocols that cannot - tolerate packet loss due to congestion (e.g., Fibre Channel over Ethernet - or RoCE). - -* **How it works**: PFC uses a specific PAUSE frame format. It shares the same - globally reserved destination MAC address (``01-80-C2-00-00-01``) as legacy - PAUSE frames but uses a unique opcode (``0x0101``). The frame payload - contains two key fields: - - - **``priority_enable_vector``**: An 8-bit mask where each bit corresponds to - one of the 8 priorities. If a bit is set to 1, it means the pause time - for that priority is active. - - **``time_vector``**: A list of eight 2-octet fields, one for each priority. - Each field specifies the ``pause_time`` for its corresponding priority, - measured in units of ``pause_quanta`` (the time to transmit 512 bits). - -.. note:: - When PFC is enabled for at least one priority on a port, the standard - **link-wide PAUSE** (IEEE 802.3 Annex 31B) must be disabled for that port. - The two mechanisms are mutually exclusive (IEEE 802.1Q Clause 36). - -Configuring Flow Control -======================== - -Link-wide PAUSE and Priority-based Flow Control are configured with different -tools. - -Configuring Link-wide PAUSE with ``ethtool`` (IEEE 802.3 Annex 31B) -------------------------------------------------------------------- -Use ``ethtool -a `` to view and ``ethtool -A `` to change -the link-wide PAUSE settings. - -.. 
code-block:: bash - - # View current link-wide PAUSE settings - ethtool -a eth0 - - # Enable RX and TX pause, with autonegotiation - ethtool -A eth0 autoneg on rx on tx on - -**Key Configuration Concepts**: - -* **Pause Autoneg vs Generic Autoneg**: ``ethtool -A ... autoneg {on,off}`` - controls **Pause Autoneg** (Annex 31B) only. It is independent from the - **Generic link autonegotiation** configured with ``ethtool -s``. A device can - have Generic autoneg **on** while Pause Autoneg is **off**, and vice versa. - -* **If Pause Autoneg is off** (``-A ... autoneg off``): the device will **not** - advertise pause in the PHY. The MAC PAUSE state is **forced** according to - ``rx``/``tx`` and does not depend on partner capabilities or resolution. - Ensure the peer is configured complementarily for PAUSE to be effective. - -* **If generic autoneg is off** but **Pause Autoneg is on**, the pause policy - is **remembered** by the kernel and applied later when Generic autoneg is - enabled again. - -* **Autonegotiation Mode**: The PHY will *advertise* the ``rx`` and ``tx`` - capabilities. The final active state is determined by what both sides of the - link agree on. See the "PHY (Physical Layer Transceiver)" section below, - especially the *Resolution* subsection, for details of the negotiation rules. - -* **Forced Mode**: This mode is necessary when autonegotiation is not used or - not possible. This includes links where one or both partners have - autonegotiation disabled, or in setups without a PHY (e.g., direct - MAC-to-MAC connections). The driver bypasses PHY advertisement and - directly forces the MAC into the specified ``rx``/``tx`` state. The - configuration on both sides of the link must be complementary. For - example, if one side is set to ``tx on`` ``rx off``, the link partner must be - set to ``tx off`` ``rx on`` for flow control to function correctly. - -Configuring PFC with ``dcb`` (IEEE 802.1Q Clause 36) ----------------------------------------------------- -PFC is part of the Data Center Bridging (DCB) subsystem and is managed with the -``dcb`` tool (iproute2). Some deployments use ``dcbtool`` (lldpad) instead; this -document shows ``dcb(8)`` examples. - -**Viewing PFC Settings**: - -.. code-block:: text - - $ dcb pfc show dev eth0 - pfc-cap 8 macsec-bypass off delay 4096 - prio-pfc 0:off 1:off 2:off 3:off 4:off 5:off 6:on 7:on - -This shows the PFC state (on/off) for each priority (0-7). - -**Changing PFC Settings**: - -.. code-block:: bash - - # Enable PFC on priorities 6 and 7, leaving others as they are - $ dcb pfc set dev eth0 prio-pfc 6:on 7:on - - # Disable PFC for all priorities except 6 and 7 - $ dcb pfc set dev eth0 prio-pfc all:off 6:on 7:on - -Monitoring Flow Control -======================= - -The standard way to check if flow control is actively being used is to view the -pause-related statistics. - -**Monitoring Link-wide PAUSE**: -Use ``ethtool --include-statistics -a ``. - -.. code-block:: text - - $ ethtool --include-statistics -a eth0 - Pause parameters for eth0: - ... - Statistics: - tx_pause_frames: 0 - rx_pause_frames: 0 - -**Monitoring PFC**: -PFC statistics (sent and received frames per priority) are available -through the ``dcb`` tool. - -.. code-block:: text - - $ dcb pfc show dev eth0 requests indications - requests 0:0 1:0 2:0 3:1024 4:2048 5:0 6:0 7:0 - indications 0:0 1:0 2:0 3:512 4:4096 5:0 6:0 7:0 - -The ``requests`` counters track transmitted PFC frames (TX), and the -``indications`` counters track received PFC frames (RX). 
- -Link-wide PAUSE Autonegotiation Details -======================================= - -The autonegotiation process for link-wide PAUSE is managed by the PHY and -involves advertising capabilities and resolving the outcome. - -* Terminology (link-wide PAUSE): - - - **Symmetric pause**: both directions are paused when requested (TX+RX - enabled). - - **Asymmetric pause**: only one direction is paused (e.g., RX-only or - TX-only). - - In IEEE 802.3 advertisement/resolution, symmetric/asymmetric are encoded - using two bits (Pause/Asym) and resolved per the standard truth tables - below. - -* **Advertisement**: The PHY advertises the MAC's flow control capabilities. - This is done using two bits in the advertisement register: "Symmetric - Pause" (Pause) and "Asymmetric Pause" (Asym). These bits should be - interpreted as a combined value, not as independent flags. The kernel - converts the user's ``rx`` and ``tx`` settings into this two-bit value as - follows: - - .. code-block:: text - - tx rx | Pause Asym - -------+------------- - 0 0 | 0 0 - 0 1 | 1 1 - 1 0 | 0 1 - 1 1 | 1 0 - -* **Resolution**: After negotiation, the PHY reports the link partner's - advertised Pause and Asym bits. The final flow control mode is determined - by the combination of the local and partner advertisements, according to - the IEEE 802.3 standard: - - .. code-block:: text - - Local Device | Link Partner | Result - Pause Asym | Pause Asym | - -------------------+--------------------+--------- - 0 X | 0 X | Disabled - 0 1 | 1 0 | Disabled - 0 1 | 1 1 | TX only - 1 0 | 0 X | Disabled - 1 X | 1 X | TX + RX - 1 1 | 0 1 | RX only - - It is important to note that the advertised bits reflect the *current - configuration* of the MAC, which may not represent its full hardware - capabilities. - -Kernel Policy: "Set and Trust" -============================== - -The ethtool pause API is defined as a **wish policy** for -IEEE 802.3 link-wide PAUSE only. A user request is always accepted -as the preferred configuration, but it may not be possible to apply -it in all link states. - -Key constraints: - -- Link-wide PAUSE is not valid on half-duplex links. -- Link-wide PAUSE cannot be used together with Priority-based Flow Control - (PFC, IEEE 802.1Q Clause 36). -- If autonegotiation is active and the link is currently down, the future - mode is not yet known. - -Because of these constraints, the kernel stores the requested setting -and applies it only when the link is in a compatible state. - -Implications for userspace: - -1. Set once (the "wish"): the requested Rx/Tx PAUSE policy is - remembered even if it cannot be applied immediately. -2. Applied conditionally: when the link comes up, the kernel enables - PAUSE only if the active mode allows it. - -Component Roles in Flow Control -=============================== - -The configuration of flow control involves several components, each with a -distinct role. - -The MAC (Media Access Controller) ---------------------------------- -The MAC is the hardware component that actually sends and receives PAUSE -frames. Its capabilities define the upper limit of what the driver can support. -For link-wide PAUSE, MACs can vary in their support for symmetric (both -directions) or asymmetric (independent TX/RX) flow control. - -For PFC, the MAC must be capable of generating and interpreting the -priority-based PAUSE frames and managing separate pause states for each -traffic class. - -Many MACs also implement automatic PAUSE frame transmission based on the fill -level of their internal RX FIFO. 
This is typically configured with two -thresholds: - -* **FLOW_ON (High Water Mark)**: When the RX FIFO usage reaches this - threshold, the MAC automatically transmits a PAUSE frame to stop the sender. - -* **FLOW_OFF (Low Water Mark)**: When the RX FIFO usage drops below this - threshold, the MAC transmits a PAUSE frame with a quantum of zero to tell - the sender it can resume transmission. - -The PHY (Physical Layer Transceiver) ------------------------------------- -The PHY's role is distinct for each flow control mechanism: - -* **Link-wide PAUSE**: During the autonegotiation process, the PHY is - responsible for advertising the device's flow control capabilities. See the - "Link-wide PAUSE Autonegotiation Details" section for more information. - -* **Half-Duplex Collision-Based Flow Control**: The PHY is fundamental to the - CSMA/CD process. It performs carrier sensing (checking if the line is idle) - and collision detection, which is the mechanism leveraged to throttle the - sender. - -* **Priority-based Flow Control (PFC)**: The PHY is not directly involved in - negotiating PFC capabilities. Its role is to establish the physical link. - PFC negotiation happens at a higher layer via the Data Center Bridging - Capability Exchange Protocol (DCBX). - -User Space Interface -==================== -The primary user space tools are ``ethtool`` for link-wide PAUSE and ``dcb`` for -PFC. They communicate with the kernel to configure the network device driver -and underlying hardware. - -**Link-wide PAUSE Netlink Interface (``ethtool``)** - -See the ethtool Netlink spec (``Documentation/netlink/specs/ethtool.yaml``) -for the authoritative definition of the Pause control and Pause statistics -attributes. The generated UAPI is in -``include/uapi/linux/ethtool_netlink_generated.h``. - -**PFC Netlink Interface (``dcb``)** - -The authoritative definitions for DCB/PFC netlink attributes and commands are in -``include/uapi/linux/dcbnl.h``. See also the ``dcb(8)`` manual page and the DCB -subsystem documentation for userspace configuration details. - diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 52aafdc85f6a..c775cababc8c 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -55,7 +55,6 @@ Contents: eql fib_trie filter - flow_control generic-hdlc generic_netlink ../netlink/specs/index diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index 40cc0a988d60..b0f2ef83735d 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -343,8 +343,16 @@ Some of the interface modes are described below: Pause frames / flow control =========================== -For detailed link-wide PAUSE and PFC behavior and configuration, see -flow_control.rst. +The PHY does not participate directly in flow control/pause frames except by +making sure that the SUPPORTED_Pause and SUPPORTED_AsymPause bits are set in +MII_ADVERTISE to indicate towards the link partner that the Ethernet MAC +controller supports such a thing. Since flow control/pause frames generation +involves the Ethernet MAC driver, it is recommended that this driver takes care +of properly indicating advertisement and support for such features by setting +the SUPPORTED_Pause and SUPPORTED_AsymPause bits accordingly. This can be done +either before or after phy_connect() and/or as a result of implementing the +ethtool::set_pauseparam feature. 
+ Keeping Close Tabs on the PAL ============================= diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index eeed1ea50369..c2d8b4ec62eb 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -953,48 +953,9 @@ struct kernel_ethtool_ts_info { * @get_pause_stats: Report pause frame statistics. Drivers must not zero * statistics which they don't report. The stats structure is initialized * to ETHTOOL_STAT_NOT_SET indicating driver does not report statistics. - * - * @get_pauseparam: Report the configured policy for link-wide PAUSE - * (IEEE 802.3 Annex 31B). Drivers must fill struct ethtool_pauseparam - * such that: - * @autoneg: - * This refers to **Pause Autoneg** (IEEE 802.3 Annex 31B) only - * and is independent of generic link autonegotiation configured - * via ethtool -s. - * true -> the device follows the negotiated result of pause - * autonegotiation (Pause/Asym); - * false -> the device uses a forced MAC state independent of - * negotiation. - * @rx_pause/@tx_pause: - * represent the desired policy (preferred configuration). - * In autoneg mode they describe what is to be advertised; - * in forced mode they describe the MAC state to apply. - * - * Drivers (and/or frameworks) should persist this policy across link - * changes and reapply appropriate MAC programming when link parameters - * change. - * - * @set_pauseparam: Apply a policy for link-wide PAUSE (IEEE 802.3 Annex 31B). - * If @autoneg is true: - * Arrange for pause advertisement (Pause/Asym) based on - * @rx_pause/@tx_pause and program the MAC to follow the - * negotiated result (which may be symmetric, asymmetric, or off - * depending on the link partner). - * If @autoneg is false: - * Do not rely on autonegotiation; force the MAC RX/TX pause - * state directly per @rx_pause/@tx_pause. - * - * Implementations that integrate with PHYLIB/PHYLINK should cooperate - * with those frameworks for advertisement and resolution; MAC drivers are - * still responsible for applying the required MAC state. - * - * Return: 0 on success or a negative errno. Return -EOPNOTSUPP if - * link-wide PAUSE is unsupported. If only symmetric pause is supported, - * reject unsupported asymmetric requests with -EINVAL (or document any - * coercion policy). - * - * See also: Documentation/networking/flow_control.rst - * + * @get_pauseparam: Report pause parameters + * @set_pauseparam: Set pause parameters. Returns a negative error code + * or zero. * @self_test: Run specified self-tests * @get_strings: Return a set of strings that describe the requested objects * @set_phys_id: Identify the physical devices, e.g. 
by flashing an LED diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 3dd9d7cde86e..0e8ac0d974e2 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -375,7 +375,7 @@ enum { ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) }; -enum ethtool_a_pause_stat { +enum { ETHTOOL_A_PAUSE_STAT_UNSPEC, ETHTOOL_A_PAUSE_STAT_PAD, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, @@ -385,7 +385,7 @@ enum ethtool_a_pause_stat { ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) }; -enum ethtool_a_pause { +enum { ETHTOOL_A_PAUSE_UNSPEC, ETHTOOL_A_PAUSE_HEADER, ETHTOOL_A_PAUSE_AUTONEG, diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 91ee22f53774..03eb1d941fca 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -27,8 +27,6 @@ * * Priority-based Flow Control (PFC) - provides a flow control mechanism which * can work independently for each 802.1p priority. - * See Documentation/networking/flow_control.rst for a high level description - * of the user space interface for Priority-based Flow Control (PFC). * * Congestion Notification - provides a mechanism for end-to-end congestion * control for protocols which do not have built-in congestion management. diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index eacf6a4859bf..0f9af1e66548 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -1,9 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* See Documentation/networking/flow_control.rst for a high level description of - * the userspace interface. - */ - #include "netlink.h" #include "common.h" -- cgit v1.2.3 From 5e1c88679174e4bfe5d152060b06d370bd85de80 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Mon, 4 Aug 2025 15:07:23 +0200 Subject: mfd: qnap-mcu: Include linux/types.h in qnap-mcu.h shared header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Relying on other components to include those basic types is unreliable and may cause compile errors like: ../include/linux/mfd/qnap-mcu.h:13:9: error: unknown type name ‘u32’ 13 | u32 baud_rate; | ^~~ ../include/linux/mfd/qnap-mcu.h:17:9: error: unknown type name ‘bool’ 17 | bool usb_led; | ^~~~ So make sure the types used in the header are available. Signed-off-by: Heiko Stuebner Link: https://lore.kernel.org/r/20250804130726.3180806-2-heiko@sntech.de Signed-off-by: Lee Jones --- include/linux/mfd/qnap-mcu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/qnap-mcu.h b/include/linux/mfd/qnap-mcu.h index 8d48c212fd44..42bf523f9a5b 100644 --- a/include/linux/mfd/qnap-mcu.h +++ b/include/linux/mfd/qnap-mcu.h @@ -7,6 +7,8 @@ #ifndef _LINUX_QNAP_MCU_H_ #define _LINUX_QNAP_MCU_H_ +#include <linux/types.h> + struct qnap_mcu; struct qnap_mcu_variant { -- cgit v1.2.3 From 9c5ad8374b1fe0c8c9eaabc1146d96a543858c4a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 8 Aug 2025 17:17:56 +0200 Subject: mfd: arizona: Make legacy gpiolib interface optional The only machine that still uses the old gpio number based interface is the wlf_cragg_6410 board file. In order to remove the dependency on the interfaces, add #ifdef blocks here.
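Condensed, the guard pattern applied in this series looks like the following (a sketch assuming a pdata-style driver; the struct and function names are illustrative, only CONFIG_GPIOLIB_LEGACY and the gpio_* calls match the patch):

#include <linux/gpio.h>		/* legacy GPIO-number interface */

struct example_pdata {
#ifdef CONFIG_GPIOLIB_LEGACY
	int irq_gpio;		/* legacy GPIO number from a board file */
#endif
};

static bool example_irq_still_asserted(const struct example_pdata *pdata)
{
#ifdef CONFIG_GPIOLIB_LEGACY
	/* Active-low IRQ line: poll the pin to see if work remains. */
	if (pdata->irq_gpio)
		return !gpio_get_value_cansleep(pdata->irq_gpio);
#endif
	return false;	/* without legacy gpiolib there is nothing to poll */
}

With CONFIG_GPIOLIB_LEGACY disabled, both the pdata field and the polling fall away, so the driver no longer references the deprecated gpio_* API at all.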
Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250808151822.536879-13-arnd@kernel.org Signed-off-by: Lee Jones --- drivers/mfd/arizona-irq.c | 5 ++++- include/linux/mfd/arizona/pdata.h | 6 ++++++ sound/soc/codecs/arizona-jack.c | 17 ++++++++++++++++- 3 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/arizona-irq.c b/drivers/mfd/arizona-irq.c index 3f8622ee0e59..544016d420fe 100644 --- a/drivers/mfd/arizona-irq.c +++ b/drivers/mfd/arizona-irq.c @@ -136,7 +136,7 @@ static irqreturn_t arizona_irq_thread(int irq, void *data) dev_err(arizona->dev, "Failed to read main IRQ status: %d\n", ret); } - +#ifdef CONFIG_GPIOLIB_LEGACY /* * Poll the IRQ pin status to see if we're really done * if the interrupt controller can't do it for us. @@ -150,6 +150,7 @@ static irqreturn_t arizona_irq_thread(int irq, void *data) !gpio_get_value_cansleep(arizona->pdata.irq_gpio)) { poll = true; } +#endif } while (poll); pm_runtime_put_autosuspend(arizona->dev); @@ -349,6 +350,7 @@ int arizona_irq_init(struct arizona *arizona) goto err_map_main_irq; } +#ifdef CONFIG_GPIOLIB_LEGACY /* Used to emulate edge trigger and to work around broken pinmux */ if (arizona->pdata.irq_gpio) { if (gpio_to_irq(arizona->pdata.irq_gpio) != arizona->irq) { @@ -368,6 +370,7 @@ int arizona_irq_init(struct arizona *arizona) arizona->pdata.irq_gpio = 0; } } +#endif ret = request_threaded_irq(arizona->irq, NULL, arizona_irq_thread, flags, "arizona", arizona); diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index 2d13bbea4f3a..f72e6d4b14a7 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -117,8 +117,10 @@ struct arizona_pdata { /** Check for line output with HPDET method */ bool hpdet_acc_id_line; +#ifdef CONFIG_GPIOLIB_LEGACY /** GPIO used for mic isolation with HPDET */ int hpdet_id_gpio; +#endif /** Channel to use for headphone detection */ unsigned int hpdet_channel; @@ -129,8 +131,10 @@ struct arizona_pdata { /** Extra debounce timeout used during initial mic detection (ms) */ unsigned int micd_detect_debounce; +#ifdef CONFIG_GPIOLIB_LEGACY /** GPIO for mic detection polarity */ int micd_pol_gpio; +#endif /** Mic detect ramp rate */ unsigned int micd_bias_start_time; @@ -184,8 +188,10 @@ struct arizona_pdata { /** Haptic actuator type */ unsigned int hap_act; +#ifdef CONFIG_GPIOLIB_LEGACY /** GPIO for primary IRQ (used for edge triggered emulation) */ int irq_gpio; +#endif /** General purpose switch control */ unsigned int gpsw; diff --git a/sound/soc/codecs/arizona-jack.c b/sound/soc/codecs/arizona-jack.c index 22f9c431a0e5..6b55610ad535 100644 --- a/sound/soc/codecs/arizona-jack.c +++ b/sound/soc/codecs/arizona-jack.c @@ -461,7 +461,11 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, bool *mic) { struct arizona *arizona = info->arizona; +#ifdef CONFIG_GPIOLIB_LEGACY int id_gpio = arizona->pdata.hpdet_id_gpio; +#else + int id_gpio = 0; +#endif if (!arizona->pdata.hpdet_acc_id) return 0; @@ -472,6 +476,7 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, */ info->hpdet_res[info->num_hpdet_res++] = *reading; +#ifdef CONFIG_GPIOLIB_LEGACY /* Only check the mic directly if we didn't already ID it */ if (id_gpio && info->num_hpdet_res == 1) { dev_dbg(arizona->dev, "Measuring mic\n"); @@ -489,6 +494,7 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, ARIZONA_HP_POLL, ARIZONA_HP_POLL); return -EAGAIN; } +#endif /* OK, got both. 
Now, compare... */ dev_dbg(arizona->dev, "HPDET measured %d %d\n", @@ -529,7 +535,9 @@ static irqreturn_t arizona_hpdet_irq(int irq, void *data) { struct arizona_priv *info = data; struct arizona *arizona = info->arizona; +#ifdef CONFIG_GPIOLIB_LEGACY int id_gpio = arizona->pdata.hpdet_id_gpio; +#endif int ret, reading, state, report; bool mic = false; @@ -585,8 +593,10 @@ done: arizona_extcon_hp_clamp(info, false); +#ifdef CONFIG_GPIOLIB_LEGACY if (id_gpio) gpio_set_value_cansleep(id_gpio, 0); +#endif /* If we have a mic then reenable MICDET */ if (state && (mic || info->mic)) @@ -1317,6 +1327,7 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) regmap_update_bits(arizona->regmap, ARIZONA_GP_SWITCH_1, ARIZONA_SW1_MODE_MASK, arizona->pdata.gpsw); +#ifdef CONFIG_GPIOLIB_LEGACY if (pdata->micd_pol_gpio > 0) { if (info->micd_modes[0].gpio) mode = GPIOF_OUT_INIT_HIGH; @@ -1332,7 +1343,9 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) } info->micd_pol_gpio = gpio_to_desc(pdata->micd_pol_gpio); - } else { + } else +#endif + { if (info->micd_modes[0].gpio) mode = GPIOD_OUT_HIGH; else @@ -1353,6 +1366,7 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) } } +#ifdef CONFIG_GPIOLIB_LEGACY if (arizona->pdata.hpdet_id_gpio > 0) { ret = devm_gpio_request_one(dev, arizona->pdata.hpdet_id_gpio, GPIOF_OUT_INIT_LOW, @@ -1364,6 +1378,7 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) return ret; } } +#endif return 0; } -- cgit v1.2.3 From 719d02a25a24601c6fb8a7b1627e1abf015b6c2a Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Thu, 21 Aug 2025 20:23:34 +0200 Subject: mfd: bd71828, bd71815: Prepare for power-supply support Add core support for ROHM BD718(15/28/78) PMIC's charger blocks. 
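The named IRQ resources attached to the mfd_cell can then be resolved by the child power-supply driver through its platform device, along these lines (a hypothetical consumer sketch; only the "bd71828-chg-done" resource name is taken from the patch):

#include <linux/platform_device.h>
#include <linux/interrupt.h>

static irqreturn_t example_chg_done_irq(int irq, void *data)
{
	/* Charger signalled the top-off -> done transition. */
	return IRQ_HANDLED;
}

static int example_power_probe(struct platform_device *pdev)
{
	int irq;

	/* Resolves the DEFINE_RES_IRQ_NAMED() entry passed via the cell. */
	irq = platform_get_irq_byname(pdev, "bd71828-chg-done");
	if (irq < 0)
		return irq;

	return devm_request_threaded_irq(&pdev->dev, irq, NULL,
					 example_chg_done_irq, IRQF_ONESHOT,
					 "bd71828-chg-done", NULL);
}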
Signed-off-by: Matti Vaittinen Signed-off-by: Andreas Kemnade Link: https://lore.kernel.org/r/20250821-bd71828-charger-v3-1-cc74ac4e0fb9@kemnade.info Signed-off-by: Lee Jones --- drivers/mfd/rohm-bd71828.c | 44 ++++++++++++++++++++++------ include/linux/mfd/rohm-bd71828.h | 63 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/rohm-bd71828.c b/drivers/mfd/rohm-bd71828.c index a14b7aa69c3c..84a64c3b9c9f 100644 --- a/drivers/mfd/rohm-bd71828.c +++ b/drivers/mfd/rohm-bd71828.c @@ -45,8 +45,8 @@ static const struct resource bd71828_rtc_irqs[] = { static const struct resource bd71815_power_irqs[] = { DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_RMV, "bd71815-dcin-rmv"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_OUT, "bd71815-clps-out"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_IN, "bd71815-clps-in"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_OUT, "bd71815-dcin-clps-out"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_CLPS_IN, "bd71815-dcin-clps-in"), DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_OVP_RES, "bd71815-dcin-ovp-res"), DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_OVP_DET, "bd71815-dcin-ovp-det"), DEFINE_RES_IRQ_NAMED(BD71815_INT_DCIN_MON_RES, "bd71815-dcin-mon-res"), @@ -56,7 +56,7 @@ static const struct resource bd71815_power_irqs[] = { DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_LOW_RES, "bd71815-vsys-low-res"), DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_LOW_DET, "bd71815-vsys-low-det"), DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_MON_RES, "bd71815-vsys-mon-res"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_MON_RES, "bd71815-vsys-mon-det"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_VSYS_MON_DET, "bd71815-vsys-mon-det"), DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_WDG_TEMP, "bd71815-chg-wdg-temp"), DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_WDG_TIME, "bd71815-chg-wdg"), DEFINE_RES_IRQ_NAMED(BD71815_INT_CHG_RECHARGE_RES, "bd71815-rechg-res"), @@ -87,10 +87,10 @@ static const struct resource bd71815_power_irqs[] = { DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_2_DET, "bd71815-bat-oc2-det"), DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_3_RES, "bd71815-bat-oc3-res"), DEFINE_RES_IRQ_NAMED(BD71815_INT_BAT_OVER_CURR_3_DET, "bd71815-bat-oc3-det"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_RES, "bd71815-bat-low-res"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_DET, "bd71815-bat-low-det"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_RES, "bd71815-bat-hi-res"), - DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_DET, "bd71815-bat-hi-det"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_RES, "bd71815-temp-bat-low-res"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_LOW_DET, "bd71815-temp-bat-low-det"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_RES, "bd71815-temp-bat-hi-res"), + DEFINE_RES_IRQ_NAMED(BD71815_INT_TEMP_BAT_HI_DET, "bd71815-temp-bat-hi-det"), }; static const struct mfd_cell bd71815_mfd_cells[] = { @@ -109,7 +109,30 @@ static const struct mfd_cell bd71815_mfd_cells[] = { }, }; -static const struct mfd_cell bd71828_mfd_cells[] = { +static const struct resource bd71828_power_irqs[] = { + DEFINE_RES_IRQ_NAMED(BD71828_INT_CHG_TOPOFF_TO_DONE, + "bd71828-chg-done"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_DCIN_DET, "bd71828-pwr-dcin-in"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_DCIN_RMV, "bd71828-pwr-dcin-out"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_BAT_LOW_VOLT_RES, + "bd71828-vbat-normal"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_BAT_LOW_VOLT_DET, "bd71828-vbat-low"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_BAT_HI_DET, "bd71828-btemp-hi"), + 
DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_BAT_HI_RES, "bd71828-btemp-cool"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_BAT_LOW_DET, "bd71828-btemp-lo"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_BAT_LOW_RES, + "bd71828-btemp-warm"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_CHIP_OVER_VF_DET, + "bd71828-temp-hi"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_CHIP_OVER_VF_RES, + "bd71828-temp-norm"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_CHIP_OVER_125_DET, + "bd71828-temp-125-over"), + DEFINE_RES_IRQ_NAMED(BD71828_INT_TEMP_CHIP_OVER_125_RES, + "bd71828-temp-125-under"), +}; + +static struct mfd_cell bd71828_mfd_cells[] = { { .name = "bd71828-pmic", }, { .name = "bd71828-gpio", }, { .name = "bd71828-led", .of_compatible = "rohm,bd71828-leds" }, @@ -118,8 +141,11 @@ static const struct mfd_cell bd71828_mfd_cells[] = { * BD70528 clock gate are the register address and mask. */ { .name = "bd71828-clk", }, - { .name = "bd71827-power", }, { + .name = "bd71828-power", + .resources = bd71828_power_irqs, + .num_resources = ARRAY_SIZE(bd71828_power_irqs), + }, { .name = "bd71828-rtc", .resources = bd71828_rtc_irqs, .num_resources = ARRAY_SIZE(bd71828_rtc_irqs), diff --git a/include/linux/mfd/rohm-bd71828.h b/include/linux/mfd/rohm-bd71828.h index ce786c96404a..73a71ef69152 100644 --- a/include/linux/mfd/rohm-bd71828.h +++ b/include/linux/mfd/rohm-bd71828.h @@ -189,6 +189,69 @@ enum { /* Charger/Battey */ #define BD71828_REG_CHG_STATE 0x65 #define BD71828_REG_CHG_FULL 0xd2 +#define BD71828_REG_CHG_EN 0x6F +#define BD71828_REG_DCIN_STAT 0x68 +#define BD71828_MASK_DCIN_DET 0x01 +#define BD71828_REG_VDCIN_U 0x9c +#define BD71828_MASK_CHG_EN 0x01 +#define BD71828_CHG_MASK_DCIN_U 0x0f +#define BD71828_REG_BAT_STAT 0x67 +#define BD71828_REG_BAT_TEMP 0x6c +#define BD71828_MASK_BAT_TEMP 0x07 +#define BD71828_BAT_TEMP_OPEN 0x07 +#define BD71828_MASK_BAT_DET 0x20 +#define BD71828_MASK_BAT_DET_DONE 0x10 +#define BD71828_REG_CHG_STATE 0x65 +#define BD71828_REG_VBAT_U 0x8c +#define BD71828_MASK_VBAT_U 0x0f +#define BD71828_REG_VBAT_REX_AVG_U 0x92 + +#define BD71828_REG_OCV_PWRON_U 0x8A + +#define BD71828_REG_VBAT_MIN_AVG_U 0x8e +#define BD71828_REG_VBAT_MIN_AVG_L 0x8f + +#define BD71828_REG_CC_CNT3 0xb5 +#define BD71828_REG_CC_CNT2 0xb6 +#define BD71828_REG_CC_CNT1 0xb7 +#define BD71828_REG_CC_CNT0 0xb8 +#define BD71828_REG_CC_CURCD_AVG_U 0xb2 +#define BD71828_MASK_CC_CURCD_AVG_U 0x3f +#define BD71828_MASK_CC_CUR_DIR 0x80 +#define BD71828_REG_VM_BTMP_U 0xa1 +#define BD71828_REG_VM_BTMP_L 0xa2 +#define BD71828_MASK_VM_BTMP_U 0x0f +#define BD71828_REG_COULOMB_CTRL 0xc4 +#define BD71828_REG_COULOMB_CTRL2 0xd2 +#define BD71828_MASK_REX_CC_CLR 0x01 +#define BD71828_MASK_FULL_CC_CLR 0x10 +#define BD71828_REG_CC_CNT_FULL3 0xbd +#define BD71828_REG_CC_CNT_CHG3 0xc1 + +#define BD71828_REG_VBAT_INITIAL1_U 0x86 +#define BD71828_REG_VBAT_INITIAL1_L 0x87 + +#define BD71828_REG_VBAT_INITIAL2_U 0x88 +#define BD71828_REG_VBAT_INITIAL2_L 0x89 + +#define BD71828_REG_IBAT_U 0xb0 +#define BD71828_REG_IBAT_L 0xb1 + +#define BD71828_REG_IBAT_AVG_U 0xb2 +#define BD71828_REG_IBAT_AVG_L 0xb3 + +#define BD71828_REG_VSYS_AVG_U 0x96 +#define BD71828_REG_VSYS_AVG_L 0x97 +#define BD71828_REG_VSYS_MIN_AVG_U 0x98 +#define BD71828_REG_VSYS_MIN_AVG_L 0x99 +#define BD71828_REG_CHG_SET1 0x75 +#define BD71828_REG_ALM_VBAT_LIMIT_U 0xaa +#define BD71828_REG_BATCAP_MON_LIMIT_U 0xcc +#define BD71828_REG_CONF 0x64 + +#define BD71828_REG_DCIN_CLPS 0x71 + +#define BD71828_REG_MEAS_CLEAR 0xaf /* LEDs */ #define BD71828_REG_LED_CTRL 0x4A -- cgit v1.2.3 From 
7d096cb3e16f09d9762ae8cef897cdbc13a40029 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 21 Aug 2025 14:46:33 +0800 Subject: virtio_ring: constify virtqueue pointer for DMA helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch constifies the virtqueue pointer for DMA helpers. Reviewed-by: Christoph Hellwig Reviewed-by: Xuan Zhuo Reviewed-by: Eugenio Pérez Signed-off-by: Jason Wang Message-Id: <20250821064641.5025-2-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang Reviewed-by: Eugenio Pérez --- drivers/virtio/virtio_ring.c | 25 +++++++++++++------------ include/linux/virtio.h | 12 ++++++------ 2 files changed, 19 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index f5062061c408..103bad8cffff 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -3149,12 +3149,12 @@ EXPORT_SYMBOL_GPL(virtqueue_get_vring); * * return DMA address. Caller should check that by virtqueue_dma_mapping_error(). */ -dma_addr_t virtqueue_dma_map_single_attrs(struct virtqueue *_vq, void *ptr, +dma_addr_t virtqueue_dma_map_single_attrs(const struct virtqueue *_vq, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - struct vring_virtqueue *vq = to_vvq(_vq); + const struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) { kmsan_handle_dma(virt_to_page(ptr), offset_in_page(ptr), size, dir); @@ -3176,11 +3176,12 @@ EXPORT_SYMBOL_GPL(virtqueue_dma_map_single_attrs); * Unmap the address that is mapped by the virtqueue_dma_map_* APIs. * */ -void virtqueue_dma_unmap_single_attrs(struct virtqueue *_vq, dma_addr_t addr, +void virtqueue_dma_unmap_single_attrs(const struct virtqueue *_vq, + dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - struct vring_virtqueue *vq = to_vvq(_vq); + const struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) return; @@ -3196,9 +3197,9 @@ EXPORT_SYMBOL_GPL(virtqueue_dma_unmap_single_attrs); * * Returns 0 means dma valid. Other means invalid dma address. 
*/ -int virtqueue_dma_mapping_error(struct virtqueue *_vq, dma_addr_t addr) +int virtqueue_dma_mapping_error(const struct virtqueue *_vq, dma_addr_t addr) { - struct vring_virtqueue *vq = to_vvq(_vq); + const struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) return 0; @@ -3217,9 +3218,9 @@ EXPORT_SYMBOL_GPL(virtqueue_dma_mapping_error); * * return bool */ -bool virtqueue_dma_need_sync(struct virtqueue *_vq, dma_addr_t addr) +bool virtqueue_dma_need_sync(const struct virtqueue *_vq, dma_addr_t addr) { - struct vring_virtqueue *vq = to_vvq(_vq); + const struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) return false; @@ -3240,12 +3241,12 @@ EXPORT_SYMBOL_GPL(virtqueue_dma_need_sync); * the DMA address really needs to be synchronized * */ -void virtqueue_dma_sync_single_range_for_cpu(struct virtqueue *_vq, +void virtqueue_dma_sync_single_range_for_cpu(const struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { - struct vring_virtqueue *vq = to_vvq(_vq); + const struct vring_virtqueue *vq = to_vvq(_vq); struct device *dev = vring_dma_dev(vq); if (!vq->use_dma_api) @@ -3266,12 +3267,12 @@ EXPORT_SYMBOL_GPL(virtqueue_dma_sync_single_range_for_cpu); * Before calling this function, use virtqueue_dma_need_sync() to confirm that * the DMA address really needs to be synchronized */ -void virtqueue_dma_sync_single_range_for_device(struct virtqueue *_vq, +void virtqueue_dma_sync_single_range_for_device(const struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { - struct vring_virtqueue *vq = to_vvq(_vq); + const struct vring_virtqueue *vq = to_vvq(_vq); struct device *dev = vring_dma_dev(vq); if (!vq->use_dma_api) diff --git a/include/linux/virtio.h b/include/linux/virtio.h index db31fc6f4f1f..eab71a440fba 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -262,18 +262,18 @@ void unregister_virtio_driver(struct virtio_driver *drv); module_driver(__virtio_driver, register_virtio_driver, \ unregister_virtio_driver) -dma_addr_t virtqueue_dma_map_single_attrs(struct virtqueue *_vq, void *ptr, size_t size, +dma_addr_t virtqueue_dma_map_single_attrs(const struct virtqueue *_vq, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs); -void virtqueue_dma_unmap_single_attrs(struct virtqueue *_vq, dma_addr_t addr, +void virtqueue_dma_unmap_single_attrs(const struct virtqueue *_vq, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); -int virtqueue_dma_mapping_error(struct virtqueue *_vq, dma_addr_t addr); +int virtqueue_dma_mapping_error(const struct virtqueue *_vq, dma_addr_t addr); -bool virtqueue_dma_need_sync(struct virtqueue *_vq, dma_addr_t addr); -void virtqueue_dma_sync_single_range_for_cpu(struct virtqueue *_vq, dma_addr_t addr, +bool virtqueue_dma_need_sync(const struct virtqueue *_vq, dma_addr_t addr); +void virtqueue_dma_sync_single_range_for_cpu(const struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir); -void virtqueue_dma_sync_single_range_for_device(struct virtqueue *_vq, dma_addr_t addr, +void virtqueue_dma_sync_single_range_for_device(const struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir); -- cgit v1.2.3 From b41cb3bcf67fcb7b8297e5acc5bb3309c96c2ff2 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 21 Aug 2025 14:46:35 +0800 Subject: virtio: rename dma helpers MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following patch will introduce a virtio mapping function to avoid abusing the DMA API for devices that don't do DMA. To ease the introduction, this patch renames "dma" to "map" in the current DMA mapping helpers. Reviewed-by: Christoph Hellwig Reviewed-by: Xuan Zhuo Signed-off-by: Jason Wang Message-Id: <20250821064641.5025-4-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang Reviewed-by: Eugenio Pérez --- drivers/net/virtio_net.c | 28 +++++------ drivers/virtio/virtio_ring.c | 114 +++++++++++++++++++++---------------------- include/linux/virtio.h | 12 ++--- 3 files changed, 77 insertions(+), 77 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 975bdc5dab84..31bd32bdecaf 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -962,7 +962,7 @@ static void virtnet_rq_unmap(struct receive_queue *rq, void *buf, u32 len) if (dma->need_sync && len) { offset = buf - (head + sizeof(*dma)); - virtqueue_dma_sync_single_range_for_cpu(rq->vq, dma->addr, + virtqueue_map_sync_single_range_for_cpu(rq->vq, dma->addr, offset, len, DMA_FROM_DEVICE); } @@ -970,8 +970,8 @@ static void virtnet_rq_unmap(struct receive_queue *rq, void *buf, u32 len) if (dma->ref) return; - virtqueue_dma_unmap_single_attrs(rq->vq, dma->addr, dma->len, - DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); + virtqueue_unmap_single_attrs(rq->vq, dma->addr, dma->len, + DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); put_page(page); } @@ -1038,13 +1038,13 @@ static void *virtnet_rq_alloc(struct receive_queue *rq, u32 size, gfp_t gfp) dma->len = alloc_frag->size - sizeof(*dma); - addr = virtqueue_dma_map_single_attrs(rq->vq, dma + 1, - dma->len, DMA_FROM_DEVICE, 0); - if (virtqueue_dma_mapping_error(rq->vq, addr)) + addr = virtqueue_map_single_attrs(rq->vq, dma + 1, + dma->len, DMA_FROM_DEVICE, 0); + if (virtqueue_map_mapping_error(rq->vq, addr)) return NULL; dma->addr = addr; - dma->need_sync = virtqueue_dma_need_sync(rq->vq, addr); + dma->need_sync = virtqueue_map_need_sync(rq->vq, addr); /* Add a reference to dma to prevent the entire dma from * being released during error handling.
This reference @@ -5952,9 +5952,9 @@ static int virtnet_xsk_pool_enable(struct net_device *dev, if (!rq->xsk_buffs) return -ENOMEM; - hdr_dma = virtqueue_dma_map_single_attrs(sq->vq, &xsk_hdr, vi->hdr_len, - DMA_TO_DEVICE, 0); - if (virtqueue_dma_mapping_error(sq->vq, hdr_dma)) { + hdr_dma = virtqueue_map_single_attrs(sq->vq, &xsk_hdr, vi->hdr_len, + DMA_TO_DEVICE, 0); + if (virtqueue_map_mapping_error(sq->vq, hdr_dma)) { err = -ENOMEM; goto err_free_buffs; } @@ -5983,8 +5983,8 @@ err_sq: err_rq: xsk_pool_dma_unmap(pool, 0); err_xsk_map: - virtqueue_dma_unmap_single_attrs(rq->vq, hdr_dma, vi->hdr_len, - DMA_TO_DEVICE, 0); + virtqueue_unmap_single_attrs(rq->vq, hdr_dma, vi->hdr_len, + DMA_TO_DEVICE, 0); err_free_buffs: kvfree(rq->xsk_buffs); return err; @@ -6011,8 +6011,8 @@ static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid) xsk_pool_dma_unmap(pool, 0); - virtqueue_dma_unmap_single_attrs(sq->vq, sq->xsk_hdr_dma_addr, - vi->hdr_len, DMA_TO_DEVICE, 0); + virtqueue_unmap_single_attrs(sq->vq, sq->xsk_hdr_dma_addr, + vi->hdr_len, DMA_TO_DEVICE, 0); kvfree(rq->xsk_buffs); return err; diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 75e5f6336c8d..482a268af851 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -166,7 +166,7 @@ struct vring_virtqueue { bool packed_ring; /* Is DMA API used? */ - bool use_dma_api; + bool use_map_api; /* Can we use weak barriers? */ bool weak_barriers; @@ -268,7 +268,7 @@ static bool virtqueue_use_indirect(const struct vring_virtqueue *vq, * unconditionally on data path. */ -static bool vring_use_dma_api(const struct virtio_device *vdev) +static bool vring_use_map_api(const struct virtio_device *vdev) { if (!virtio_has_dma_quirk(vdev)) return true; @@ -291,14 +291,14 @@ static bool vring_use_dma_api(const struct virtio_device *vdev) static bool vring_need_unmap_buffer(const struct vring_virtqueue *vring, const struct vring_desc_extra *extra) { - return vring->use_dma_api && (extra->addr != DMA_MAPPING_ERROR); + return vring->use_map_api && (extra->addr != DMA_MAPPING_ERROR); } size_t virtio_max_dma_size(const struct virtio_device *vdev) { size_t max_segment_size = SIZE_MAX; - if (vring_use_dma_api(vdev)) + if (vring_use_map_api(vdev)) max_segment_size = dma_max_mapping_size(vdev->dev.parent); return max_segment_size; @@ -309,7 +309,7 @@ static void *vring_alloc_queue(struct virtio_device *vdev, size_t size, dma_addr_t *dma_handle, gfp_t flag, struct device *dma_dev) { - if (vring_use_dma_api(vdev)) { + if (vring_use_map_api(vdev)) { return dma_alloc_coherent(dma_dev, size, dma_handle, flag); } else { @@ -343,7 +343,7 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size, void *queue, dma_addr_t dma_handle, struct device *dma_dev) { - if (vring_use_dma_api(vdev)) + if (vring_use_map_api(vdev)) dma_free_coherent(dma_dev, size, queue, dma_handle); else free_pages_exact(queue, PAGE_ALIGN(size)); @@ -372,7 +372,7 @@ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *len = sg->length; - if (!vq->use_dma_api) { + if (!vq->use_map_api) { /* * If DMA is not used, KMSAN doesn't know that the scatterlist * is initialized by the hardware. 
Explicitly check/unpoison it @@ -402,17 +402,17 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, void *cpu_addr, size_t size, enum dma_data_direction direction) { - if (!vq->use_dma_api) + if (!vq->use_map_api) return (dma_addr_t)virt_to_phys(cpu_addr); - return virtqueue_dma_map_single_attrs(&vq->vq, cpu_addr, - size, direction, 0); + return virtqueue_map_single_attrs(&vq->vq, cpu_addr, + size, direction, 0); } static int vring_mapping_error(const struct vring_virtqueue *vq, dma_addr_t addr) { - if (!vq->use_dma_api) + if (!vq->use_map_api) return 0; return dma_mapping_error(vring_dma_dev(vq), addr); @@ -449,7 +449,7 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq, flags = extra->flags; if (flags & VRING_DESC_F_INDIRECT) { - if (!vq->use_dma_api) + if (!vq->use_map_api) goto out; } else if (!vring_need_unmap_buffer(vq, extra)) goto out; @@ -782,7 +782,7 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, extra = (struct vring_desc_extra *)&indir_desc[num]; - if (vq->use_dma_api) { + if (vq->use_map_api) { for (j = 0; j < num; j++) vring_unmap_one_split(vq, &extra[j]); } @@ -1150,7 +1150,7 @@ static struct virtqueue *__vring_new_virtqueue_split(unsigned int index, vq->broken = false; #endif vq->dma_dev = dma_dev; - vq->use_dma_api = vring_use_dma_api(vdev); + vq->use_map_api = vring_use_map_api(vdev); vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; @@ -1266,7 +1266,7 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq, flags = extra->flags; if (flags & VRING_DESC_F_INDIRECT) { - if (!vq->use_dma_api) + if (!vq->use_map_api) return; } else if (!vring_need_unmap_buffer(vq, extra)) return; @@ -1351,7 +1351,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, desc[i].addr = cpu_to_le64(addr); desc[i].len = cpu_to_le32(len); - if (unlikely(vq->use_dma_api)) { + if (unlikely(vq->use_map_api)) { extra[i].addr = premapped ? DMA_MAPPING_ERROR : addr; extra[i].len = len; extra[i].flags = n < out_sgs ? 0 : VRING_DESC_F_WRITE; @@ -1373,7 +1373,7 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, sizeof(struct vring_packed_desc)); vq->packed.vring.desc[head].id = cpu_to_le16(id); - if (vq->use_dma_api) { + if (vq->use_map_api) { vq->packed.desc_extra[id].addr = addr; vq->packed.desc_extra[id].len = total_sg * sizeof(struct vring_packed_desc); @@ -1515,7 +1515,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, desc[i].len = cpu_to_le32(len); desc[i].id = cpu_to_le16(id); - if (unlikely(vq->use_dma_api)) { + if (unlikely(vq->use_map_api)) { vq->packed.desc_extra[curr].addr = premapped ? 
DMA_MAPPING_ERROR : addr; vq->packed.desc_extra[curr].len = len; @@ -1650,7 +1650,7 @@ static void detach_buf_packed(struct vring_virtqueue *vq, vq->free_head = id; vq->vq.num_free += state->num; - if (unlikely(vq->use_dma_api)) { + if (unlikely(vq->use_map_api)) { curr = id; for (i = 0; i < state->num; i++) { vring_unmap_extra_packed(vq, @@ -1668,7 +1668,7 @@ static void detach_buf_packed(struct vring_virtqueue *vq, if (!desc) return; - if (vq->use_dma_api) { + if (vq->use_map_api) { len = vq->packed.desc_extra[id].len; num = len / sizeof(struct vring_packed_desc); @@ -2121,7 +2121,7 @@ static struct virtqueue *__vring_new_virtqueue_packed(unsigned int index, #endif vq->packed_ring = true; vq->dma_dev = dma_dev; - vq->use_dma_api = vring_use_dma_api(vdev); + vq->use_map_api = vring_use_map_api(vdev); vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; @@ -2433,7 +2433,7 @@ struct device *virtqueue_dma_dev(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - if (vq->use_dma_api) + if (vq->use_map_api) return vring_dma_dev(vq); else return NULL; @@ -3122,7 +3122,7 @@ const struct vring *virtqueue_get_vring(const struct virtqueue *vq) EXPORT_SYMBOL_GPL(virtqueue_get_vring); /** - * virtqueue_dma_map_single_attrs - map DMA for _vq + * virtqueue_map_single_attrs - map DMA for _vq * @_vq: the struct virtqueue we're talking about. * @ptr: the pointer of the buffer to do dma * @size: the size of the buffer to do dma @@ -3132,16 +3132,16 @@ EXPORT_SYMBOL_GPL(virtqueue_get_vring); * The caller calls this to do dma mapping in advance. The DMA address can be * passed to this _vq when it is in pre-mapped mode. * - * return DMA address. Caller should check that by virtqueue_dma_mapping_error(). + * return DMA address. Caller should check that by virtqueue_mapping_error(). */ -dma_addr_t virtqueue_dma_map_single_attrs(const struct virtqueue *_vq, void *ptr, - size_t size, - enum dma_data_direction dir, - unsigned long attrs) +dma_addr_t virtqueue_map_single_attrs(const struct virtqueue *_vq, void *ptr, + size_t size, + enum dma_data_direction dir, + unsigned long attrs) { const struct vring_virtqueue *vq = to_vvq(_vq); - if (!vq->use_dma_api) { + if (!vq->use_map_api) { kmsan_handle_dma(virt_to_page(ptr), offset_in_page(ptr), size, dir); return (dma_addr_t)virt_to_phys(ptr); } @@ -3154,85 +3154,85 @@ dma_addr_t virtqueue_dma_map_single_attrs(const struct virtqueue *_vq, void *ptr return dma_map_page_attrs(vring_dma_dev(vq), virt_to_page(ptr), offset_in_page(ptr), size, dir, attrs); } -EXPORT_SYMBOL_GPL(virtqueue_dma_map_single_attrs); +EXPORT_SYMBOL_GPL(virtqueue_map_single_attrs); /** - * virtqueue_dma_unmap_single_attrs - unmap DMA for _vq + * virtqueue_unmap_single_attrs - unmap map for _vq * @_vq: the struct virtqueue we're talking about. * @addr: the dma address to unmap * @size: the size of the buffer * @dir: DMA direction * @attrs: DMA Attrs * - * Unmap the address that is mapped by the virtqueue_dma_map_* APIs. + * Unmap the address that is mapped by the virtqueue_map_* APIs. 
* */ -void virtqueue_dma_unmap_single_attrs(const struct virtqueue *_vq, - dma_addr_t addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) +void virtqueue_unmap_single_attrs(const struct virtqueue *_vq, + dma_addr_t addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) { const struct vring_virtqueue *vq = to_vvq(_vq); - if (!vq->use_dma_api) + if (!vq->use_map_api) return; dma_unmap_page_attrs(vring_dma_dev(vq), addr, size, dir, attrs); } -EXPORT_SYMBOL_GPL(virtqueue_dma_unmap_single_attrs); +EXPORT_SYMBOL_GPL(virtqueue_unmap_single_attrs); /** - * virtqueue_dma_mapping_error - check dma address + * virtqueue_map_mapping_error - check dma address * @_vq: the struct virtqueue we're talking about. * @addr: DMA address * * Returns 0 means dma valid. Other means invalid dma address. */ -int virtqueue_dma_mapping_error(const struct virtqueue *_vq, dma_addr_t addr) +int virtqueue_map_mapping_error(const struct virtqueue *_vq, dma_addr_t addr) { const struct vring_virtqueue *vq = to_vvq(_vq); - if (!vq->use_dma_api) + if (!vq->use_map_api) return 0; return dma_mapping_error(vring_dma_dev(vq), addr); } -EXPORT_SYMBOL_GPL(virtqueue_dma_mapping_error); +EXPORT_SYMBOL_GPL(virtqueue_map_mapping_error); /** - * virtqueue_dma_need_sync - check a dma address needs sync + * virtqueue_map_need_sync - check a dma address needs sync * @_vq: the struct virtqueue we're talking about. * @addr: DMA address * - * Check if the dma address mapped by the virtqueue_dma_map_* APIs needs to be + * Check if the dma address mapped by the virtqueue_map_* APIs needs to be * synchronized * * return bool */ -bool virtqueue_dma_need_sync(const struct virtqueue *_vq, dma_addr_t addr) +bool virtqueue_map_need_sync(const struct virtqueue *_vq, dma_addr_t addr) { const struct vring_virtqueue *vq = to_vvq(_vq); - if (!vq->use_dma_api) + if (!vq->use_map_api) return false; return dma_need_sync(vring_dma_dev(vq), addr); } -EXPORT_SYMBOL_GPL(virtqueue_dma_need_sync); +EXPORT_SYMBOL_GPL(virtqueue_map_need_sync); /** - * virtqueue_dma_sync_single_range_for_cpu - dma sync for cpu + * virtqueue_map_sync_single_range_for_cpu - map sync for cpu * @_vq: the struct virtqueue we're talking about. * @addr: DMA address * @offset: DMA address offset * @size: buf size for sync * @dir: DMA direction * - * Before calling this function, use virtqueue_dma_need_sync() to confirm that + * Before calling this function, use virtqueue_map_need_sync() to confirm that * the DMA address really needs to be synchronized * */ -void virtqueue_dma_sync_single_range_for_cpu(const struct virtqueue *_vq, +void virtqueue_map_sync_single_range_for_cpu(const struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) @@ -3240,25 +3240,25 @@ void virtqueue_dma_sync_single_range_for_cpu(const struct virtqueue *_vq, const struct vring_virtqueue *vq = to_vvq(_vq); struct device *dev = vring_dma_dev(vq); - if (!vq->use_dma_api) + if (!vq->use_map_api) return; dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); } -EXPORT_SYMBOL_GPL(virtqueue_dma_sync_single_range_for_cpu); +EXPORT_SYMBOL_GPL(virtqueue_map_sync_single_range_for_cpu); /** - * virtqueue_dma_sync_single_range_for_device - dma sync for device + * virtqueue_map_sync_single_range_for_device - map sync for device * @_vq: the struct virtqueue we're talking about. 
* @addr: DMA address * @offset: DMA address offset * @size: buf size for sync * @dir: DMA direction * - * Before calling this function, use virtqueue_dma_need_sync() to confirm that + * Before calling this function, use virtqueue_map_need_sync() to confirm that * the DMA address really needs to be synchronized */ -void virtqueue_dma_sync_single_range_for_device(const struct virtqueue *_vq, +void virtqueue_map_sync_single_range_for_device(const struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) @@ -3266,12 +3266,12 @@ void virtqueue_dma_sync_single_range_for_device(const struct virtqueue *_vq, const struct vring_virtqueue *vq = to_vvq(_vq); struct device *dev = vring_dma_dev(vq); - if (!vq->use_dma_api) + if (!vq->use_map_api) return; dma_sync_single_range_for_device(dev, addr, offset, size, dir); } -EXPORT_SYMBOL_GPL(virtqueue_dma_sync_single_range_for_device); +EXPORT_SYMBOL_GPL(virtqueue_map_sync_single_range_for_device); MODULE_DESCRIPTION("Virtio ring implementation"); MODULE_LICENSE("GPL"); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index eab71a440fba..576e08bd7697 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -262,18 +262,18 @@ void unregister_virtio_driver(struct virtio_driver *drv); module_driver(__virtio_driver, register_virtio_driver, \ unregister_virtio_driver) -dma_addr_t virtqueue_dma_map_single_attrs(const struct virtqueue *_vq, void *ptr, size_t size, +dma_addr_t virtqueue_map_single_attrs(const struct virtqueue *_vq, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs); -void virtqueue_dma_unmap_single_attrs(const struct virtqueue *_vq, dma_addr_t addr, +void virtqueue_unmap_single_attrs(const struct virtqueue *_vq, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); -int virtqueue_dma_mapping_error(const struct virtqueue *_vq, dma_addr_t addr); +int virtqueue_map_mapping_error(const struct virtqueue *_vq, dma_addr_t addr); -bool virtqueue_dma_need_sync(const struct virtqueue *_vq, dma_addr_t addr); -void virtqueue_dma_sync_single_range_for_cpu(const struct virtqueue *_vq, dma_addr_t addr, +bool virtqueue_map_need_sync(const struct virtqueue *_vq, dma_addr_t addr); +void virtqueue_map_sync_single_range_for_cpu(const struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir); -void virtqueue_dma_sync_single_range_for_device(const struct virtqueue *_vq, dma_addr_t addr, +void virtqueue_map_sync_single_range_for_device(const struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir); -- cgit v1.2.3 From b16060c5c7d56455da3c3c50b4a20a83c2a30810 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 21 Aug 2025 14:46:36 +0800 Subject: virtio: introduce virtio_map container union MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following patch will introduce the mapping operations for virtio devices. To achieve this, besides the DMA device, the virtio core needs to support transport- or device-specific mapping metadata as well. So this patch introduces a union container holding the DMA device. The idea is to allow the transport layer to pass device-specific mapping metadata, which will be used as a parameter for the virtio mapping operations. Transports or devices that use DMA keep using the DMA device.
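[ Editorial illustration, not part of the patch: a minimal sketch of how a transport would hand the new container union to the core. Only union virtio_map and vring_create_virtqueue_map() come from this series; my_notify, my_callback and the queue geometry are hypothetical. ]

#include <linux/virtio.h>
#include <linux/virtio_ring.h>

static bool my_notify(struct virtqueue *vq) { return true; }
static void my_callback(struct virtqueue *vq) { }

static struct virtqueue *my_setup_vq(struct virtio_device *vdev,
				     unsigned int index)
{
	/* A DMA-capable transport still just wraps its DMA device;
	 * other transports can later carry their own metadata in
	 * the union. */
	union virtio_map map = { .dma_dev = vdev->dev.parent };

	return vring_create_virtqueue_map(index, 256, SMP_CACHE_BYTES,
					  vdev,
					  true,  /* weak_barriers */
					  true,  /* may_reduce_num */
					  false, /* context */
					  my_notify, my_callback,
					  "my-vq", map);
}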
Signed-off-by: Jason Wang Message-Id: <20250821064641.5025-5-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang Reviewed-by: Eugenio Pérez --- drivers/virtio/virtio_ring.c | 104 ++++++++++++++++++++++--------------------- drivers/virtio/virtio_vdpa.c | 6 ++- include/linux/virtio.h | 5 +++ include/linux/virtio_ring.h | 7 +-- 4 files changed, 66 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 482a268af851..cda9bc2121bf 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -210,8 +210,7 @@ struct vring_virtqueue { /* DMA, allocation, and size information */ bool we_own_ring; - /* Device used for doing DMA */ - struct device *dma_dev; + union virtio_map map; #ifdef DEBUG /* They're supposed to lock for us. */ @@ -307,10 +306,10 @@ EXPORT_SYMBOL_GPL(virtio_max_dma_size); static void *vring_alloc_queue(struct virtio_device *vdev, size_t size, dma_addr_t *dma_handle, gfp_t flag, - struct device *dma_dev) + union virtio_map map) { if (vring_use_map_api(vdev)) { - return dma_alloc_coherent(dma_dev, size, + return dma_alloc_coherent(map.dma_dev, size, dma_handle, flag); } else { void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag); @@ -341,10 +340,10 @@ static void *vring_alloc_queue(struct virtio_device *vdev, size_t size, static void vring_free_queue(struct virtio_device *vdev, size_t size, void *queue, dma_addr_t dma_handle, - struct device *dma_dev) + union virtio_map map) { if (vring_use_map_api(vdev)) - dma_free_coherent(dma_dev, size, queue, dma_handle); + dma_free_coherent(map.dma_dev, size, queue, dma_handle); else free_pages_exact(queue, PAGE_ALIGN(size)); } @@ -356,7 +355,7 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size, */ static struct device *vring_dma_dev(const struct vring_virtqueue *vq) { - return vq->dma_dev; + return vq->map.dma_dev; } /* Map one sg entry. */ @@ -1056,12 +1055,13 @@ err_state: } static void vring_free_split(struct vring_virtqueue_split *vring_split, - struct virtio_device *vdev, struct device *dma_dev) + struct virtio_device *vdev, + union virtio_map map) { vring_free_queue(vdev, vring_split->queue_size_in_bytes, vring_split->vring.desc, vring_split->queue_dma_addr, - dma_dev); + map); kfree(vring_split->desc_state); kfree(vring_split->desc_extra); @@ -1072,7 +1072,7 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split, u32 num, unsigned int vring_align, bool may_reduce_num, - struct device *dma_dev) + union virtio_map map) { void *queue = NULL; dma_addr_t dma_addr; @@ -1088,7 +1088,7 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split, queue = vring_alloc_queue(vdev, vring_size(num, vring_align), &dma_addr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, - dma_dev); + map); if (queue) break; if (!may_reduce_num) @@ -1102,7 +1102,7 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split, /* Try to get a single page. You are my only hope! 
*/ queue = vring_alloc_queue(vdev, vring_size(num, vring_align), &dma_addr, GFP_KERNEL | __GFP_ZERO, - dma_dev); + map); } if (!queue) return -ENOMEM; @@ -1126,7 +1126,7 @@ static struct virtqueue *__vring_new_virtqueue_split(unsigned int index, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, - struct device *dma_dev) + union virtio_map map) { struct vring_virtqueue *vq; int err; @@ -1149,7 +1149,7 @@ static struct virtqueue *__vring_new_virtqueue_split(unsigned int index, #else vq->broken = false; #endif - vq->dma_dev = dma_dev; + vq->map = map; vq->use_map_api = vring_use_map_api(vdev); vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && @@ -1187,21 +1187,21 @@ static struct virtqueue *vring_create_virtqueue_split( bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, - struct device *dma_dev) + union virtio_map map) { struct vring_virtqueue_split vring_split = {}; struct virtqueue *vq; int err; err = vring_alloc_queue_split(&vring_split, vdev, num, vring_align, - may_reduce_num, dma_dev); + may_reduce_num, map); if (err) return NULL; vq = __vring_new_virtqueue_split(index, &vring_split, vdev, weak_barriers, - context, notify, callback, name, dma_dev); + context, notify, callback, name, map); if (!vq) { - vring_free_split(&vring_split, vdev, dma_dev); + vring_free_split(&vring_split, vdev, map); return NULL; } @@ -1220,7 +1220,7 @@ static int virtqueue_resize_split(struct virtqueue *_vq, u32 num) err = vring_alloc_queue_split(&vring_split, vdev, num, vq->split.vring_align, vq->split.may_reduce_num, - vring_dma_dev(vq)); + vq->map); if (err) goto err; @@ -1238,7 +1238,7 @@ static int virtqueue_resize_split(struct virtqueue *_vq, u32 num) return 0; err_state_extra: - vring_free_split(&vring_split, vdev, vring_dma_dev(vq)); + vring_free_split(&vring_split, vdev, vq->map); err: virtqueue_reinit_split(vq); return -ENOMEM; @@ -1947,25 +1947,25 @@ static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num) static void vring_free_packed(struct vring_virtqueue_packed *vring_packed, struct virtio_device *vdev, - struct device *dma_dev) + union virtio_map map) { if (vring_packed->vring.desc) vring_free_queue(vdev, vring_packed->ring_size_in_bytes, vring_packed->vring.desc, vring_packed->ring_dma_addr, - dma_dev); + map); if (vring_packed->vring.driver) vring_free_queue(vdev, vring_packed->event_size_in_bytes, vring_packed->vring.driver, vring_packed->driver_event_dma_addr, - dma_dev); + map); if (vring_packed->vring.device) vring_free_queue(vdev, vring_packed->event_size_in_bytes, vring_packed->vring.device, vring_packed->device_event_dma_addr, - dma_dev); + map); kfree(vring_packed->desc_state); kfree(vring_packed->desc_extra); @@ -1973,7 +1973,7 @@ static void vring_free_packed(struct vring_virtqueue_packed *vring_packed, static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed, struct virtio_device *vdev, - u32 num, struct device *dma_dev) + u32 num, union virtio_map map) { struct vring_packed_desc *ring; struct vring_packed_desc_event *driver, *device; @@ -1985,7 +1985,7 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed, ring = vring_alloc_queue(vdev, ring_size_in_bytes, &ring_dma_addr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, - dma_dev); + map); if (!ring) goto err; @@ -1998,7 +1998,7 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed, driver = vring_alloc_queue(vdev, event_size_in_bytes, 
&driver_event_dma_addr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, - dma_dev); + map); if (!driver) goto err; @@ -2009,7 +2009,7 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed, device = vring_alloc_queue(vdev, event_size_in_bytes, &device_event_dma_addr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, - dma_dev); + map); if (!device) goto err; @@ -2021,7 +2021,7 @@ static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed, return 0; err: - vring_free_packed(vring_packed, vdev, dma_dev); + vring_free_packed(vring_packed, vdev, map); return -ENOMEM; } @@ -2097,7 +2097,7 @@ static struct virtqueue *__vring_new_virtqueue_packed(unsigned int index, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, - struct device *dma_dev) + union virtio_map map) { struct vring_virtqueue *vq; int err; @@ -2120,7 +2120,7 @@ static struct virtqueue *__vring_new_virtqueue_packed(unsigned int index, vq->broken = false; #endif vq->packed_ring = true; - vq->dma_dev = dma_dev; + vq->map = map; vq->use_map_api = vring_use_map_api(vdev); vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && @@ -2158,18 +2158,18 @@ static struct virtqueue *vring_create_virtqueue_packed( bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, - struct device *dma_dev) + union virtio_map map) { struct vring_virtqueue_packed vring_packed = {}; struct virtqueue *vq; - if (vring_alloc_queue_packed(&vring_packed, vdev, num, dma_dev)) + if (vring_alloc_queue_packed(&vring_packed, vdev, num, map)) return NULL; vq = __vring_new_virtqueue_packed(index, &vring_packed, vdev, weak_barriers, - context, notify, callback, name, dma_dev); + context, notify, callback, name, map); if (!vq) { - vring_free_packed(&vring_packed, vdev, dma_dev); + vring_free_packed(&vring_packed, vdev, map); return NULL; } @@ -2185,7 +2185,7 @@ static int virtqueue_resize_packed(struct virtqueue *_vq, u32 num) struct virtio_device *vdev = _vq->vdev; int err; - if (vring_alloc_queue_packed(&vring_packed, vdev, num, vring_dma_dev(vq))) + if (vring_alloc_queue_packed(&vring_packed, vdev, num, vq->map)) goto err_ring; err = vring_alloc_state_extra_packed(&vring_packed); @@ -2202,7 +2202,7 @@ static int virtqueue_resize_packed(struct virtqueue *_vq, u32 num) return 0; err_state_extra: - vring_free_packed(&vring_packed, vdev, vring_dma_dev(vq)); + vring_free_packed(&vring_packed, vdev, vq->map); err_ring: virtqueue_reinit_packed(vq); return -ENOMEM; @@ -2434,7 +2434,7 @@ struct device *virtqueue_dma_dev(struct virtqueue *_vq) struct vring_virtqueue *vq = to_vvq(_vq); if (vq->use_map_api) - return vring_dma_dev(vq); + return vq->map.dma_dev; else return NULL; } @@ -2719,19 +2719,20 @@ struct virtqueue *vring_create_virtqueue( void (*callback)(struct virtqueue *), const char *name) { + union virtio_map map = {.dma_dev = vdev->dev.parent}; if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) return vring_create_virtqueue_packed(index, num, vring_align, vdev, weak_barriers, may_reduce_num, - context, notify, callback, name, vdev->dev.parent); + context, notify, callback, name, map); return vring_create_virtqueue_split(index, num, vring_align, vdev, weak_barriers, may_reduce_num, - context, notify, callback, name, vdev->dev.parent); + context, notify, callback, name, map); } EXPORT_SYMBOL_GPL(vring_create_virtqueue); -struct virtqueue *vring_create_virtqueue_dma( +struct virtqueue *vring_create_virtqueue_map( unsigned int index, unsigned int num, 
unsigned int vring_align, @@ -2742,19 +2743,19 @@ struct virtqueue *vring_create_virtqueue_dma( bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, - struct device *dma_dev) + union virtio_map map) { if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) return vring_create_virtqueue_packed(index, num, vring_align, vdev, weak_barriers, may_reduce_num, - context, notify, callback, name, dma_dev); + context, notify, callback, name, map); return vring_create_virtqueue_split(index, num, vring_align, vdev, weak_barriers, may_reduce_num, - context, notify, callback, name, dma_dev); + context, notify, callback, name, map); } -EXPORT_SYMBOL_GPL(vring_create_virtqueue_dma); +EXPORT_SYMBOL_GPL(vring_create_virtqueue_map); /** * virtqueue_resize - resize the vring of vq @@ -2865,6 +2866,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, const char *name) { struct vring_virtqueue_split vring_split = {}; + union virtio_map map = {.dma_dev = vdev->dev.parent}; if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) { struct vring_virtqueue_packed vring_packed = {}; @@ -2874,13 +2876,13 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, return __vring_new_virtqueue_packed(index, &vring_packed, vdev, weak_barriers, context, notify, callback, - name, vdev->dev.parent); + name, map); } vring_init(&vring_split.vring, num, pages, vring_align); return __vring_new_virtqueue_split(index, &vring_split, vdev, weak_barriers, context, notify, callback, name, - vdev->dev.parent); + map); } EXPORT_SYMBOL_GPL(vring_new_virtqueue); @@ -2894,19 +2896,19 @@ static void vring_free(struct virtqueue *_vq) vq->packed.ring_size_in_bytes, vq->packed.vring.desc, vq->packed.ring_dma_addr, - vring_dma_dev(vq)); + vq->map); vring_free_queue(vq->vq.vdev, vq->packed.event_size_in_bytes, vq->packed.vring.driver, vq->packed.driver_event_dma_addr, - vring_dma_dev(vq)); + vq->map); vring_free_queue(vq->vq.vdev, vq->packed.event_size_in_bytes, vq->packed.vring.device, vq->packed.device_event_dma_addr, - vring_dma_dev(vq)); + vq->map); kfree(vq->packed.desc_state); kfree(vq->packed.desc_extra); @@ -2915,7 +2917,7 @@ static void vring_free(struct virtqueue *_vq) vq->split.queue_size_in_bytes, vq->split.vring.desc, vq->split.queue_dma_addr, - vring_dma_dev(vq)); + vq->map); } } if (!vq->packed_ring) { diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index 657b07a60788..dc557aa7c825 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -139,6 +139,7 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, struct vdpa_callback cb; struct virtqueue *vq; u64 desc_addr, driver_addr, device_addr; + union virtio_map map = {0}; /* Assume split virtqueue, switch to packed if necessary */ struct vdpa_vq_state state = {0}; u32 align, max_num, min_num = 1; @@ -185,9 +186,10 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, dma_dev = ops->get_vq_dma_dev(vdpa, index); else dma_dev = vdpa_get_dma_dev(vdpa); - vq = vring_create_virtqueue_dma(index, max_num, align, vdev, + map.dma_dev = dma_dev; + vq = vring_create_virtqueue_map(index, max_num, align, vdev, true, may_reduce_num, ctx, - notify, callback, name, dma_dev); + notify, callback, name, map); if (!vq) { err = -ENOMEM; goto error_new_virtqueue; diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 576e08bd7697..b4ba1a99e5ab 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -41,6 +41,11 @@ struct virtqueue { void *priv; }; +union 
virtio_map { + /* Device that performs DMA */ + struct device *dma_dev; +}; + int virtqueue_add_outbuf(struct virtqueue *vq, struct scatterlist sg[], unsigned int num, void *data, diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 9b33df741b63..c97a12c1cda3 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -3,6 +3,7 @@ #define _LINUX_VIRTIO_RING_H #include +#include #include #include @@ -79,9 +80,9 @@ struct virtqueue *vring_create_virtqueue(unsigned int index, /* * Creates a virtqueue and allocates the descriptor ring with per - * virtqueue DMA device. + * virtqueue mapping operations. */ -struct virtqueue *vring_create_virtqueue_dma(unsigned int index, +struct virtqueue *vring_create_virtqueue_map(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, @@ -91,7 +92,7 @@ bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name, - struct device *dma_dev); + union virtio_map map); /* * Creates a virtqueue with a standard layout but a caller-allocated -- cgit v1.2.3 From bee8c7c24b737338216dc0f87d6c47a4abaf609a Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 21 Aug 2025 14:46:38 +0800 Subject: virtio: introduce map ops in virtio core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces map operations for virtio devices. Virtio used to use the DMA API, which is not necessarily appropriate since some devices don't do DMA. Instead of using tricks and abusing the DMA API, let's simply abstract the current mapping logic into virtio-specific mapping operations. Devices or transports that don't do DMA can implement their own mapping logic without needing to trick the DMA core. In this case the mapping metadata is opaque to the virtio core and is passed back to the transport- or device-specific map operations. For other devices, the DMA API is still used, so the map token is still the DMA device, to minimize the changeset and performance impact. The mapping operations are abstracted as an independent structure instead of reusing virtio_config_ops. This allows a transport to simply reuse the structure for lower layers like vDPA. A set of new mapping helpers is introduced for devices that want to do the mapping by themselves.
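[ Editorial sketch of the resulting contract, not part of the patch: a transport that does no real DMA could back the new ops with plain physical addresses. All my_* names are hypothetical; the operation signatures are the ones added to struct virtio_map_ops below. ]

#include <linux/virtio_config.h>
#include <linux/mm.h>

static dma_addr_t my_map_page(union virtio_map map, struct page *page,
			      unsigned long offset, size_t size,
			      enum dma_data_direction dir, unsigned long attrs)
{
	/* "Mapping" degenerates to physical-address arithmetic. */
	return page_to_phys(page) + offset;
}

static void my_unmap_page(union virtio_map map, dma_addr_t handle,
			  size_t size, enum dma_data_direction dir,
			  unsigned long attrs)
{
	/* Nothing to undo for a pure physical-address mapping. */
}

static void my_sync_for_cpu(union virtio_map map, dma_addr_t handle,
			    size_t size, enum dma_data_direction dir) { }

static void my_sync_for_device(union virtio_map map, dma_addr_t handle,
			       size_t size, enum dma_data_direction dir) { }

static void *my_alloc(union virtio_map map, size_t size,
		      dma_addr_t *handle, gfp_t gfp)
{
	void *va = alloc_pages_exact(PAGE_ALIGN(size), gfp);

	if (va)
		*handle = virt_to_phys(va);
	return va;
}

static void my_free(union virtio_map map, size_t size, void *vaddr,
		    dma_addr_t handle, unsigned long attrs)
{
	free_pages_exact(vaddr, PAGE_ALIGN(size));
}

static bool my_need_sync(union virtio_map map, dma_addr_t handle)
{
	return false;
}

static int my_mapping_error(union virtio_map map, dma_addr_t handle)
{
	return 0;
}

static size_t my_max_mapping_size(union virtio_map map)
{
	return SIZE_MAX;
}

static const struct virtio_map_ops my_map_ops = {
	.map_page		= my_map_page,
	.unmap_page		= my_unmap_page,
	.sync_single_for_cpu	= my_sync_for_cpu,
	.sync_single_for_device	= my_sync_for_device,
	.alloc			= my_alloc,
	.free			= my_free,
	.need_sync		= my_need_sync,
	.mapping_error		= my_mapping_error,
	.max_mapping_size	= my_max_mapping_size,
};

[ A transport would point vdev->map at such a structure before its virtqueues are created; when vdev->map is NULL, the core keeps using the DMA API as before. ]

Signed-off-by: Jason Wang Message-Id: <20250821064641.5025-7-jasowang@redhat.com> Signed-off-by: Michael S.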
Tsirkin Tested-by: Lei Yang Reviewed-by: Eugenio Pérez --- drivers/virtio/virtio_ring.c | 211 +++++++++++++++++++++++++++++++++--------- drivers/virtio/virtio_vdpa.c | 3 + include/linux/virtio.h | 25 +++++ include/linux/virtio_config.h | 72 ++++++++++++++ 4 files changed, 269 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 46515b017ccb..f91a432b3e53 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -297,8 +297,14 @@ size_t virtio_max_dma_size(const struct virtio_device *vdev) { size_t max_segment_size = SIZE_MAX; - if (vring_use_map_api(vdev)) - max_segment_size = dma_max_mapping_size(vdev->dev.parent); + if (vring_use_map_api(vdev)) { + if (vdev->map) { + max_segment_size = + vdev->map->max_mapping_size(vdev->vmap); + } else + max_segment_size = + dma_max_mapping_size(vdev->dev.parent); + } return max_segment_size; } @@ -309,8 +315,8 @@ static void *vring_alloc_queue(struct virtio_device *vdev, size_t size, union virtio_map map) { if (vring_use_map_api(vdev)) { - return dma_alloc_coherent(map.dma_dev, size, - map_handle, flag); + return virtqueue_map_alloc_coherent(vdev, map, size, + map_handle, flag); } else { void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag); @@ -343,7 +349,8 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size, union virtio_map map) { if (vring_use_map_api(vdev)) - dma_free_coherent(map.dma_dev, size, queue, map_handle); + virtqueue_map_free_coherent(vdev, map, size, + queue, map_handle); else free_pages_exact(queue, PAGE_ALIGN(size)); } @@ -358,6 +365,20 @@ static struct device *vring_dma_dev(const struct vring_virtqueue *vq) return vq->map.dma_dev; } +static int vring_mapping_error(const struct vring_virtqueue *vq, + dma_addr_t addr) +{ + struct virtio_device *vdev = vq->vq.vdev; + + if (!vq->use_map_api) + return 0; + + if (vdev->map) + return vdev->map->mapping_error(vq->map, addr); + else + return dma_mapping_error(vring_dma_dev(vq), addr); +} + /* Map one sg entry. */ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg, enum dma_data_direction direction, dma_addr_t *addr, @@ -387,11 +408,11 @@ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist * the way it expects (we don't guarantee that the scatterlist * will exist for the lifetime of the mapping). */ - *addr = dma_map_page(vring_dma_dev(vq), - sg_page(sg), sg->offset, sg->length, - direction); + *addr = virtqueue_map_page_attrs(&vq->vq, sg_page(sg), + sg->offset, sg->length, + direction, 0); - if (dma_mapping_error(vring_dma_dev(vq), *addr)) + if (vring_mapping_error(vq, *addr)) return -ENOMEM; return 0; @@ -408,15 +429,6 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, size, direction, 0); } -static int vring_mapping_error(const struct vring_virtqueue *vq, - dma_addr_t addr) -{ - if (!vq->use_map_api) - return 0; - - return dma_mapping_error(vring_dma_dev(vq), addr); -} - static void virtqueue_init(struct vring_virtqueue *vq, u32 num) { vq->vq.num_free = num; @@ -453,11 +465,12 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq, } else if (!vring_need_unmap_buffer(vq, extra)) goto out; - dma_unmap_page(vring_dma_dev(vq), - extra->addr, - extra->len, - (flags & VRING_DESC_F_WRITE) ? - DMA_FROM_DEVICE : DMA_TO_DEVICE); + virtqueue_unmap_page_attrs(&vq->vq, + extra->addr, + extra->len, + (flags & VRING_DESC_F_WRITE) ? 
+ DMA_FROM_DEVICE : DMA_TO_DEVICE, + 0); out: return extra->next; @@ -1271,10 +1284,11 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq, } else if (!vring_need_unmap_buffer(vq, extra)) return; - dma_unmap_page(vring_dma_dev(vq), - extra->addr, extra->len, - (flags & VRING_DESC_F_WRITE) ? - DMA_FROM_DEVICE : DMA_TO_DEVICE); + virtqueue_unmap_page_attrs(&vq->vq, + extra->addr, extra->len, + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE, + 0); } static struct vring_packed_desc *alloc_indirect_packed(unsigned int total_sg, @@ -2433,7 +2447,7 @@ struct device *virtqueue_dma_dev(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - if (vq->use_map_api) + if (vq->use_map_api && !_vq->vdev->map) return vq->map.dma_dev; else return NULL; @@ -3123,6 +3137,107 @@ const struct vring *virtqueue_get_vring(const struct virtqueue *vq) } EXPORT_SYMBOL_GPL(virtqueue_get_vring); +/** + * virtqueue_map_alloc_coherent - alloc coherent mapping + * @vdev: the virtio device we are talking to + * @map: metadata for performing mapping + * @size: the size of the buffer + * @map_handle: the pointer to the mapped address + * @gfp: allocation flag (GFP_XXX) + * + * return virtual address or NULL on error + */ +void *virtqueue_map_alloc_coherent(struct virtio_device *vdev, + union virtio_map map, + size_t size, dma_addr_t *map_handle, + gfp_t gfp) +{ + if (vdev->map) + return vdev->map->alloc(map, size, + map_handle, gfp); + else + return dma_alloc_coherent(map.dma_dev, size, + map_handle, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_map_alloc_coherent); + +/** + * virtqueue_map_free_coherent - free coherent mapping + * @vdev: the virtio device we are talking to + * @map: metadata for performing mapping + * @size: the size of the buffer + * @map_handle: the mapped address that needs to be freed + * + */ +void virtqueue_map_free_coherent(struct virtio_device *vdev, + union virtio_map map, size_t size, void *vaddr, + dma_addr_t map_handle) +{ + if (vdev->map) + vdev->map->free(map, size, vaddr, + map_handle, 0); + else + dma_free_coherent(map.dma_dev, size, vaddr, map_handle); +} +EXPORT_SYMBOL_GPL(virtqueue_map_free_coherent); + +/** + * virtqueue_map_page_attrs - map a page to the device + * @_vq: the virtqueue we are talking to + * @page: the page that will be mapped by the device + * @offset: the offset in the page for a buffer + * @size: the buffer size + * @dir: mapping direction + * @attrs: mapping attributes + * + * Returns mapped address. Caller should check that by virtqueue_mapping_error(). 
+ */ +dma_addr_t virtqueue_map_page_attrs(const struct virtqueue *_vq, + struct page *page, + unsigned long offset, + size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + const struct vring_virtqueue *vq = to_vvq(_vq); + struct virtio_device *vdev = _vq->vdev; + + if (vdev->map) + return vdev->map->map_page(vq->map, + page, offset, size, + dir, attrs); + + return dma_map_page_attrs(vring_dma_dev(vq), + page, offset, size, + dir, attrs); +} +EXPORT_SYMBOL_GPL(virtqueue_map_page_attrs); + +/** + * virtqueue_unmap_page_attrs - map a page to the device + * @_vq: the virtqueue we are talking to + * @map_handle: the mapped address + * @size: the buffer size + * @dir: mapping direction + * @attrs: unmapping attributes + */ +void virtqueue_unmap_page_attrs(const struct virtqueue *_vq, + dma_addr_t map_handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + const struct vring_virtqueue *vq = to_vvq(_vq); + struct virtio_device *vdev = _vq->vdev; + + if (vdev->map) + vdev->map->unmap_page(vq->map, + map_handle, size, dir, attrs); + else + dma_unmap_page_attrs(vring_dma_dev(vq), map_handle, + size, dir, attrs); +} +EXPORT_SYMBOL_GPL(virtqueue_unmap_page_attrs); + /** * virtqueue_map_single_attrs - map DMA for _vq * @_vq: the struct virtqueue we're talking about. @@ -3134,7 +3249,7 @@ EXPORT_SYMBOL_GPL(virtqueue_get_vring); * The caller calls this to do dma mapping in advance. The DMA address can be * passed to this _vq when it is in pre-mapped mode. * - * return DMA address. Caller should check that by virtqueue_mapping_error(). + * return mapped address. Caller should check that by virtqueue_mapping_error(). */ dma_addr_t virtqueue_map_single_attrs(const struct virtqueue *_vq, void *ptr, size_t size, @@ -3153,8 +3268,8 @@ dma_addr_t virtqueue_map_single_attrs(const struct virtqueue *_vq, void *ptr, "rejecting DMA map of vmalloc memory\n")) return DMA_MAPPING_ERROR; - return dma_map_page_attrs(vring_dma_dev(vq), virt_to_page(ptr), - offset_in_page(ptr), size, dir, attrs); + return virtqueue_map_page_attrs(&vq->vq, virt_to_page(ptr), + offset_in_page(ptr), size, dir, attrs); } EXPORT_SYMBOL_GPL(virtqueue_map_single_attrs); @@ -3179,12 +3294,12 @@ void virtqueue_unmap_single_attrs(const struct virtqueue *_vq, if (!vq->use_map_api) return; - dma_unmap_page_attrs(vring_dma_dev(vq), addr, size, dir, attrs); + virtqueue_unmap_page_attrs(_vq, addr, size, dir, attrs); } EXPORT_SYMBOL_GPL(virtqueue_unmap_single_attrs); /** - * virtqueue_map_mapping_error - check dma address + * virtqueue_mapping_error - check dma address * @_vq: the struct virtqueue we're talking about. 
* @addr: DMA address * @@ -3194,10 +3309,7 @@ int virtqueue_map_mapping_error(const struct virtqueue *_vq, dma_addr_t addr) { const struct vring_virtqueue *vq = to_vvq(_vq); - if (!vq->use_map_api) - return 0; - - return dma_mapping_error(vring_dma_dev(vq), addr); + return vring_mapping_error(vq, addr); } EXPORT_SYMBOL_GPL(virtqueue_map_mapping_error); @@ -3214,11 +3326,15 @@ EXPORT_SYMBOL_GPL(virtqueue_map_mapping_error); bool virtqueue_map_need_sync(const struct virtqueue *_vq, dma_addr_t addr) { const struct vring_virtqueue *vq = to_vvq(_vq); + struct virtio_device *vdev = _vq->vdev; if (!vq->use_map_api) return false; - return dma_need_sync(vring_dma_dev(vq), addr); + if (vdev->map) + return vdev->map->need_sync(vq->map, addr); + else + return dma_need_sync(vring_dma_dev(vq), addr); } EXPORT_SYMBOL_GPL(virtqueue_map_need_sync); @@ -3240,12 +3356,17 @@ void virtqueue_map_sync_single_range_for_cpu(const struct virtqueue *_vq, enum dma_data_direction dir) { const struct vring_virtqueue *vq = to_vvq(_vq); - struct device *dev = vring_dma_dev(vq); + struct virtio_device *vdev = _vq->vdev; if (!vq->use_map_api) return; - dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); + if (vdev->map) + vdev->map->sync_single_for_cpu(vq->map, + addr + offset, size, dir); + else + dma_sync_single_range_for_cpu(vring_dma_dev(vq), + addr, offset, size, dir); } EXPORT_SYMBOL_GPL(virtqueue_map_sync_single_range_for_cpu); @@ -3266,12 +3387,18 @@ void virtqueue_map_sync_single_range_for_device(const struct virtqueue *_vq, enum dma_data_direction dir) { const struct vring_virtqueue *vq = to_vvq(_vq); - struct device *dev = vring_dma_dev(vq); + struct virtio_device *vdev = _vq->vdev; if (!vq->use_map_api) return; - dma_sync_single_range_for_device(dev, addr, offset, size, dir); + if (vdev->map) + vdev->map->sync_single_for_device(vq->map, + addr + offset, + size, dir); + else + dma_sync_single_range_for_device(vring_dma_dev(vq), addr, + offset, size, dir); } EXPORT_SYMBOL_GPL(virtqueue_map_sync_single_range_for_device); diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index dc557aa7c825..d4be689e3626 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -195,6 +195,9 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, goto error_new_virtqueue; } + if (index == 0) + vdev->vmap = map; + vq->num_max = max_num; /* Setup virtqueue callback */ diff --git a/include/linux/virtio.h b/include/linux/virtio.h index b4ba1a99e5ab..3386a4a8d06b 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -166,9 +166,11 @@ struct virtio_device { struct virtio_device_id id; const struct virtio_config_ops *config; const struct vringh_config_ops *vringh_config; + const struct virtio_map_ops *map; struct list_head vqs; VIRTIO_DECLARE_FEATURES(features); void *priv; + union virtio_map vmap; #ifdef CONFIG_VIRTIO_DEBUG struct dentry *debugfs_dir; u64 debugfs_filter_features[VIRTIO_FEATURES_DWORDS]; @@ -267,6 +269,29 @@ void unregister_virtio_driver(struct virtio_driver *drv); module_driver(__virtio_driver, register_virtio_driver, \ unregister_virtio_driver) + +void *virtqueue_map_alloc_coherent(struct virtio_device *vdev, + union virtio_map mapping_token, + size_t size, dma_addr_t *dma_handle, + gfp_t gfp); + +void virtqueue_map_free_coherent(struct virtio_device *vdev, + union virtio_map mapping_token, + size_t size, void *vaddr, + dma_addr_t dma_handle); + +dma_addr_t virtqueue_map_page_attrs(const struct virtqueue *_vq, + struct page *page, + 
unsigned long offset, + size_t size, + enum dma_data_direction dir, + unsigned long attrs); + +void virtqueue_unmap_page_attrs(const struct virtqueue *_vq, + dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs); + dma_addr_t virtqueue_map_single_attrs(const struct virtqueue *_vq, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs); void virtqueue_unmap_single_attrs(const struct virtqueue *_vq, dma_addr_t addr, diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 7427b79d6f3d..16001e9f9b39 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -139,6 +139,78 @@ struct virtio_config_ops { int (*enable_vq_after_reset)(struct virtqueue *vq); }; +/** + * struct virtio_map_ops - operations for mapping buffer for a virtio device + * Note: For transport that has its own mapping logic it must + * implements all of the operations + * @map_page: map a buffer to the device + * map: metadata for performing mapping + * page: the page that will be mapped by the device + * offset: the offset in the page for a buffer + * size: the buffer size + * dir: mapping direction + * attrs: mapping attributes + * Returns: the mapped address + * @unmap_page: unmap a buffer from the device + * map: device specific mapping map + * map_handle: the mapped address + * size: the buffer size + * dir: mapping direction + * attrs: unmapping attributes + * @sync_single_for_cpu: sync a single buffer from device to cpu + * map: metadata for performing mapping + * map_handle: the mapping address to sync + * size: the size of the buffer + * dir: synchronization direction + * @sync_single_for_device: sync a single buffer from cpu to device + * map: metadata for performing mapping + * map_handle: the mapping address to sync + * size: the size of the buffer + * dir: synchronization direction + * @alloc: alloc a coherent buffer mapping + * map: metadata for performing mapping + * size: the size of the buffer + * map_handle: the mapping address to sync + * gfp: allocation flag (GFP_XXX) + * Returns: virtual address of the allocated buffer + * @free: free a coherent buffer mapping + * map: metadata for performing mapping + * size: the size of the buffer + * vaddr: virtual address of the buffer + * map_handle: the mapping address to sync + * attrs: unmapping attributes + * @need_sync: if the buffer needs synchronization + * map: metadata for performing mapping + * map_handle: the mapped address + * Returns: whether the buffer needs synchronization + * @mapping_error: if the mapping address is error + * map: metadata for performing mapping + * map_handle: the mapped address + * @max_mapping_size: get the maximum buffer size that can be mapped + * map: metadata for performing mapping + * Returns: the maximum buffer size that can be mapped + */ +struct virtio_map_ops { + dma_addr_t (*map_page)(union virtio_map map, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, unsigned long attrs); + void (*unmap_page)(union virtio_map map, dma_addr_t map_handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs); + void (*sync_single_for_cpu)(union virtio_map map, dma_addr_t map_handle, + size_t size, enum dma_data_direction dir); + void (*sync_single_for_device)(union virtio_map map, + dma_addr_t map_handle, size_t size, + enum dma_data_direction dir); + void *(*alloc)(union virtio_map map, size_t size, + dma_addr_t *map_handle, gfp_t gfp); + void (*free)(union virtio_map map, size_t 
size, void *vaddr, + dma_addr_t map_handle, unsigned long attrs); + bool (*need_sync)(union virtio_map map, dma_addr_t map_handle); + int (*mapping_error)(union virtio_map map, dma_addr_t map_handle); + size_t (*max_mapping_size)(union virtio_map map); +}; + /* If driver didn't advertise the feature, it will never appear. */ void virtio_check_driver_offered_feature(const struct virtio_device *vdev, unsigned int fbit); -- cgit v1.2.3 From 58aca3dbc7d8891a016cb17d488af3002812793b Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 21 Aug 2025 14:46:39 +0800 Subject: vdpa: support virtio_map MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Virtio core switches from DMA device to virtio_map, let's do that as well for vDPA. Signed-off-by: Jason Wang Message-Id: <20250821064641.5025-8-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Tested-by: Lei Yang Reviewed-by: Eugenio Pérez --- drivers/vdpa/alibaba/eni_vdpa.c | 2 +- drivers/vdpa/ifcvf/ifcvf_main.c | 2 +- drivers/vdpa/mlx5/core/mr.c | 4 ++-- drivers/vdpa/mlx5/net/mlx5_vnet.c | 13 ++++++++----- drivers/vdpa/octeon_ep/octep_vdpa_main.c | 2 +- drivers/vdpa/pds/vdpa_dev.c | 2 +- drivers/vdpa/solidrun/snet_main.c | 4 ++-- drivers/vdpa/vdpa.c | 2 +- drivers/vdpa/vdpa_sim/vdpa_sim.c | 2 +- drivers/vdpa/vdpa_user/vduse_dev.c | 2 +- drivers/vdpa/virtio_pci/vp_vdpa.c | 2 +- drivers/vhost/vdpa.c | 6 ++++-- drivers/virtio/virtio_vdpa.c | 11 +++++------ include/linux/vdpa.h | 15 ++++++++------- 14 files changed, 37 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/vdpa/alibaba/eni_vdpa.c b/drivers/vdpa/alibaba/eni_vdpa.c index ad7f3447fe90..54aea086d08c 100644 --- a/drivers/vdpa/alibaba/eni_vdpa.c +++ b/drivers/vdpa/alibaba/eni_vdpa.c @@ -496,7 +496,7 @@ static int eni_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) pci_set_master(pdev); pci_set_drvdata(pdev, eni_vdpa); - eni_vdpa->vdpa.dma_dev = &pdev->dev; + eni_vdpa->vdpa.vmap.dma_dev = &pdev->dev; eni_vdpa->queues = eni_vdpa_get_num_queues(eni_vdpa); eni_vdpa->vring = devm_kcalloc(&pdev->dev, eni_vdpa->queues, diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index ccf64d7bbfaa..979d188d74ee 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -713,7 +713,7 @@ static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, ifcvf_mgmt_dev->adapter = adapter; adapter->pdev = pdev; - adapter->vdpa.dma_dev = &pdev->dev; + adapter->vdpa.vmap.dma_dev = &pdev->dev; adapter->vdpa.mdev = mdev; adapter->vf = vf; vdpa_dev = &adapter->vdpa; diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index c7a20278bc3c..8870a7169267 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -378,7 +378,7 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr u64 pa, offset; u64 paend; struct scatterlist *sg; - struct device *dma = mvdev->vdev.dma_dev; + struct device *dma = mvdev->vdev.vmap.dma_dev; for (map = vhost_iotlb_itree_first(iotlb, mr->start, mr->end - 1); map; map = vhost_iotlb_itree_next(map, mr->start, mr->end - 1)) { @@ -432,7 +432,7 @@ err_map: static void unmap_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr *mr) { - struct device *dma = mvdev->vdev.dma_dev; + struct device *dma = mvdev->vdev.vmap.dma_dev; destroy_direct_mr(mvdev, mr); dma_unmap_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0); diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 0ed2fc28e1ce..a7e76f175914 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -3395,14 +3395,17 @@ static int mlx5_vdpa_reset_map(struct vdpa_device *vdev, unsigned int asid) return err; } -static struct device *mlx5_get_vq_dma_dev(struct vdpa_device *vdev, u16 idx) +static union virtio_map mlx5_get_vq_map(struct vdpa_device *vdev, u16 idx) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + union virtio_map map; if (is_ctrl_vq_idx(mvdev, idx)) - return &vdev->dev; + map.dma_dev = &vdev->dev; + else + map.dma_dev = mvdev->vdev.vmap.dma_dev; - return mvdev->vdev.dma_dev; + return map; } static void free_irqs(struct mlx5_vdpa_net *ndev) @@ -3686,7 +3689,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = { .set_map = mlx5_vdpa_set_map, .reset_map = mlx5_vdpa_reset_map, .set_group_asid = mlx5_set_group_asid, - .get_vq_dma_dev = mlx5_get_vq_dma_dev, + .get_vq_map = mlx5_get_vq_map, .free = mlx5_vdpa_free, .suspend = mlx5_vdpa_suspend, .resume = mlx5_vdpa_resume, /* Op disabled if not supported. */ @@ -3965,7 +3968,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, } ndev->mvdev.mlx_features = device_features; - mvdev->vdev.dma_dev = &mdev->pdev->dev; + mvdev->vdev.vmap.dma_dev = &mdev->pdev->dev; err = mlx5_vdpa_alloc_resources(&ndev->mvdev); if (err) goto err_alloc; diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c b/drivers/vdpa/octeon_ep/octep_vdpa_main.c index 9b49efd24391..5818dae133a3 100644 --- a/drivers/vdpa/octeon_ep/octep_vdpa_main.c +++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c @@ -516,7 +516,7 @@ static int octep_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, } oct_vdpa->pdev = pdev; - oct_vdpa->vdpa.dma_dev = &pdev->dev; + oct_vdpa->vdpa.vmap.dma_dev = &pdev->dev; oct_vdpa->vdpa.mdev = mdev; oct_vdpa->oct_hw = oct_hw; vdpa_dev = &oct_vdpa->vdpa; diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c index 301d95e08596..63d82263fb52 100644 --- a/drivers/vdpa/pds/vdpa_dev.c +++ b/drivers/vdpa/pds/vdpa_dev.c @@ -643,7 +643,7 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, pdev = vdpa_aux->padev->vf_pdev; dma_dev = &pdev->dev; - pdsv->vdpa_dev.dma_dev = dma_dev; + pdsv->vdpa_dev.vmap.dma_dev = dma_dev; status = pds_vdpa_get_status(&pdsv->vdpa_dev); if (status == 0xff) { diff --git a/drivers/vdpa/solidrun/snet_main.c b/drivers/vdpa/solidrun/snet_main.c index 55ec51c17ab3..39050aab147f 100644 --- a/drivers/vdpa/solidrun/snet_main.c +++ b/drivers/vdpa/solidrun/snet_main.c @@ -1052,8 +1052,8 @@ static int snet_vdpa_probe_vf(struct pci_dev *pdev) */ snet_reserve_irq_idx(pf_irqs ? pdev_pf : pdev, snet); - /*set DMA device*/ - snet->vdpa.dma_dev = &pdev->dev; + /* set map metadata */ + snet->vdpa.vmap.dma_dev = &pdev->dev; /* Register VDPA device */ ret = vdpa_register_device(&snet->vdpa, snet->cfg->vq_num); diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 8a372b51c21a..c71debeb8471 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -151,7 +151,7 @@ static void vdpa_release_dev(struct device *d) * Driver should use vdpa_alloc_device() wrapper macro instead of * using this directly. * - * Return: Returns an error when parent/config/dma_dev is not set or fail to get + * Return: Returns an error when parent/config/map is not set or fail to get * ida. 
*/ struct vdpa_device *__vdpa_alloc_device(struct device *parent, diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index c204fc8e471a..22ee53538444 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -272,7 +272,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, vringh_set_iotlb(&vdpasim->vqs[i].vring, &vdpasim->iommu[0], &vdpasim->iommu_lock); - vdpasim->vdpa.dma_dev = dev; + vdpasim->vdpa.vmap.dma_dev = dev; return vdpasim; diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index 04620bb77203..f68ed569394c 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -2022,7 +2022,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) return ret; } set_dma_ops(&vdev->vdpa.dev, &vduse_dev_dma_ops); - vdev->vdpa.dma_dev = &vdev->vdpa.dev; + vdev->vdpa.vmap.dma_dev = &vdev->vdpa.dev; vdev->vdpa.mdev = &vduse_mgmt->mgmt_dev; return 0; diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index 8787407f75b0..242641c0f2bd 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -520,7 +520,7 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, vp_vdpa_mgtdev->vp_vdpa = vp_vdpa; - vp_vdpa->vdpa.dma_dev = &pdev->dev; + vp_vdpa->vdpa.vmap.dma_dev = &pdev->dev; vp_vdpa->queues = vp_modern_get_num_queues(mdev); vp_vdpa->mdev = mdev; diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index af1e1fdfd9ed..05a481e4c385 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -1318,7 +1318,8 @@ static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v) { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; - struct device *dma_dev = vdpa_get_dma_dev(vdpa); + union virtio_map map = vdpa_get_map(vdpa); + struct device *dma_dev = map.dma_dev; int ret; /* Device want to do DMA by itself */ @@ -1353,7 +1354,8 @@ err_attach: static void vhost_vdpa_free_domain(struct vhost_vdpa *v) { struct vdpa_device *vdpa = v->vdpa; - struct device *dma_dev = vdpa_get_dma_dev(vdpa); + union virtio_map map = vdpa_get_map(vdpa); + struct device *dma_dev = map.dma_dev; if (v->domain) { iommu_detach_device(v->domain, dma_dev); diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index d4be689e3626..8b27c6e8eebb 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -133,7 +133,6 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, const char *name, bool ctx) { struct vdpa_device *vdpa = vd_get_vdpa(vdev); - struct device *dma_dev; const struct vdpa_config_ops *ops = vdpa->config; bool (*notify)(struct virtqueue *vq) = virtio_vdpa_notify; struct vdpa_callback cb; @@ -182,11 +181,11 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, /* Create the vring */ align = ops->get_vq_align(vdpa); - if (ops->get_vq_dma_dev) - dma_dev = ops->get_vq_dma_dev(vdpa, index); + if (ops->get_vq_map) + map = ops->get_vq_map(vdpa, index); else - dma_dev = vdpa_get_dma_dev(vdpa); - map.dma_dev = dma_dev; + map = vdpa_get_map(vdpa); + vq = vring_create_virtqueue_map(index, max_num, align, vdev, true, may_reduce_num, ctx, notify, callback, name, map); @@ -467,7 +466,7 @@ static int virtio_vdpa_probe(struct vdpa_device *vdpa) if (!vd_dev) return -ENOMEM; - vd_dev->vdev.dev.parent = vdpa_get_dma_dev(vdpa); + vd_dev->vdev.dev.parent = vdpa_get_map(vdpa).dma_dev; 
vd_dev->vdev.dev.release = virtio_vdpa_release_dev; vd_dev->vdev.config = &virtio_vdpa_config_ops; vd_dev->vdpa = vdpa; diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 2e7a30fe6b92..ae0451945851 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -70,7 +71,7 @@ struct vdpa_mgmt_dev; /** * struct vdpa_device - representation of a vDPA device * @dev: underlying device - * @dma_dev: the actual device that is performing DMA + * @vmap: the metadata passed to upper layer to be used for mapping * @driver_override: driver name to force a match; do not set directly, * because core frees it; use driver_set_override() to * set or clear it. @@ -87,7 +88,7 @@ struct vdpa_mgmt_dev; */ struct vdpa_device { struct device dev; - struct device *dma_dev; + union virtio_map vmap; const char *driver_override; const struct vdpa_config_ops *config; struct rw_semaphore cf_lock; /* Protects get/set config */ @@ -352,11 +353,11 @@ struct vdpa_map_file { * @vdev: vdpa device * @asid: address space identifier * Returns integer: success (0) or error (< 0) - * @get_vq_dma_dev: Get the dma device for a specific + * @get_vq_map: Get the map metadata for a specific * virtqueue (optional) * @vdev: vdpa device * @idx: virtqueue index - * Returns pointer to structure device or error (NULL) + * Returns the map token union or error (NULL) * @bind_mm: Bind the device to a specific address space * so the vDPA framework can use VA when this * callback is implemented. (optional) @@ -436,7 +437,7 @@ struct vdpa_config_ops { int (*reset_map)(struct vdpa_device *vdev, unsigned int asid); int (*set_group_asid)(struct vdpa_device *vdev, unsigned int group, unsigned int asid); - struct device *(*get_vq_dma_dev)(struct vdpa_device *vdev, u16 idx); + union virtio_map (*get_vq_map)(struct vdpa_device *vdev, u16 idx); int (*bind_mm)(struct vdpa_device *vdev, struct mm_struct *mm); void (*unbind_mm)(struct vdpa_device *vdev); @@ -520,9 +521,9 @@ static inline void vdpa_set_drvdata(struct vdpa_device *vdev, void *data) dev_set_drvdata(&vdev->dev, data); } -static inline struct device *vdpa_get_dma_dev(struct vdpa_device *vdev) +static inline union virtio_map vdpa_get_map(struct vdpa_device *vdev) { - return vdev->dma_dev; + return vdev->vmap; } static inline int vdpa_reset(struct vdpa_device *vdev, u32 flags) -- cgit v1.2.3 From 0d16cc439f36355d04b17ac45c3001d90969aa44 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 24 Sep 2025 15:00:44 +0800 Subject: vdpa: introduce map ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Virtio core allows the transport to provide device or transport specific mapping functions. This patch adds this support to vDPA. We can simply do this by allowing the vDPA parent to register a virtio_map_ops. Reviewed-by: Christoph Hellwig Signed-off-by: Jason Wang Message-Id: <20250924070045.10361-2-jasowang@redhat.com> Signed-off-by: Michael S.
Tsirkin Reviewed-by: Eugenio Pérez --- drivers/vdpa/alibaba/eni_vdpa.c | 3 ++- drivers/vdpa/ifcvf/ifcvf_main.c | 3 ++- drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +- drivers/vdpa/octeon_ep/octep_vdpa_main.c | 4 ++-- drivers/vdpa/pds/vdpa_dev.c | 3 ++- drivers/vdpa/solidrun/snet_main.c | 4 ++-- drivers/vdpa/vdpa.c | 3 +++ drivers/vdpa/vdpa_sim/vdpa_sim.c | 2 +- drivers/vdpa/vdpa_user/vduse_dev.c | 3 ++- drivers/vdpa/virtio_pci/vp_vdpa.c | 3 ++- drivers/virtio/virtio_vdpa.c | 4 +++- include/linux/vdpa.h | 10 +++++++--- 12 files changed, 29 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/vdpa/alibaba/eni_vdpa.c b/drivers/vdpa/alibaba/eni_vdpa.c index 54aea086d08c..e476504db0c8 100644 --- a/drivers/vdpa/alibaba/eni_vdpa.c +++ b/drivers/vdpa/alibaba/eni_vdpa.c @@ -478,7 +478,8 @@ static int eni_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; eni_vdpa = vdpa_alloc_device(struct eni_vdpa, vdpa, - dev, &eni_vdpa_ops, 1, 1, NULL, false); + dev, &eni_vdpa_ops, NULL, + 1, 1, NULL, false); if (IS_ERR(eni_vdpa)) { ENI_ERR(pdev, "failed to allocate vDPA structure\n"); return PTR_ERR(eni_vdpa); diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 979d188d74ee..6658dc74d915 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -705,7 +705,8 @@ static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, vf = &ifcvf_mgmt_dev->vf; pdev = vf->pdev; adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, - &pdev->dev, &ifc_vdpa_ops, 1, 1, NULL, false); + &pdev->dev, &ifc_vdpa_ops, + NULL, 1, 1, NULL, false); if (IS_ERR(adapter)) { IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); return PTR_ERR(adapter); diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index a7e76f175914..82034efb74fc 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -3882,7 +3882,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, } ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mgtdev->vdpa_ops, - MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false); + NULL, MLX5_VDPA_NUMVQ_GROUPS, MLX5_VDPA_NUM_AS, name, false); if (IS_ERR(ndev)) return PTR_ERR(ndev); diff --git a/drivers/vdpa/octeon_ep/octep_vdpa_main.c b/drivers/vdpa/octeon_ep/octep_vdpa_main.c index 5818dae133a3..9e8d07078606 100644 --- a/drivers/vdpa/octeon_ep/octep_vdpa_main.c +++ b/drivers/vdpa/octeon_ep/octep_vdpa_main.c @@ -508,8 +508,8 @@ static int octep_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, u64 device_features; int ret; - oct_vdpa = vdpa_alloc_device(struct octep_vdpa, vdpa, &pdev->dev, &octep_vdpa_ops, 1, 1, - NULL, false); + oct_vdpa = vdpa_alloc_device(struct octep_vdpa, vdpa, &pdev->dev, &octep_vdpa_ops, + NULL, 1, 1, NULL, false); if (IS_ERR(oct_vdpa)) { dev_err(&pdev->dev, "Failed to allocate vDPA structure for octep vdpa device"); return PTR_ERR(oct_vdpa); diff --git a/drivers/vdpa/pds/vdpa_dev.c b/drivers/vdpa/pds/vdpa_dev.c index 63d82263fb52..36f61cc96e21 100644 --- a/drivers/vdpa/pds/vdpa_dev.c +++ b/drivers/vdpa/pds/vdpa_dev.c @@ -632,7 +632,8 @@ static int pds_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, } pdsv = vdpa_alloc_device(struct pds_vdpa_device, vdpa_dev, - dev, &pds_vdpa_ops, 1, 1, name, false); + dev, &pds_vdpa_ops, NULL, + 1, 1, name, false); if (IS_ERR(pdsv)) { dev_err(dev, "Failed to allocate vDPA structure: %pe\n", pdsv); return PTR_ERR(pdsv); diff 
--git a/drivers/vdpa/solidrun/snet_main.c b/drivers/vdpa/solidrun/snet_main.c index 39050aab147f..4588211d57eb 100644 --- a/drivers/vdpa/solidrun/snet_main.c +++ b/drivers/vdpa/solidrun/snet_main.c @@ -1008,8 +1008,8 @@ static int snet_vdpa_probe_vf(struct pci_dev *pdev) } /* Allocate vdpa device */ - snet = vdpa_alloc_device(struct snet, vdpa, &pdev->dev, &snet_config_ops, 1, 1, NULL, - false); + snet = vdpa_alloc_device(struct snet, vdpa, &pdev->dev, &snet_config_ops, + NULL, 1, 1, NULL, false); if (!snet) { SNET_ERR(pdev, "Failed to allocate a vdpa device\n"); ret = -ENOMEM; diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index c71debeb8471..34874beb0152 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -142,6 +142,7 @@ static void vdpa_release_dev(struct device *d) * initialized but before registered. * @parent: the parent device * @config: the bus operations that is supported by this device + * @map: the map operations that is supported by this device * @ngroups: number of groups supported by this device * @nas: number of address spaces supported by this device * @size: size of the parent structure that contains private data @@ -156,6 +157,7 @@ static void vdpa_release_dev(struct device *d) */ struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, + const struct virtio_map_ops *map, unsigned int ngroups, unsigned int nas, size_t size, const char *name, bool use_va) @@ -187,6 +189,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, vdev->dev.release = vdpa_release_dev; vdev->index = err; vdev->config = config; + vdev->map = map; vdev->features_valid = false; vdev->use_va = use_va; vdev->ngroups = ngroups; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 22ee53538444..c1c6431950e1 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -215,7 +215,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, else ops = &vdpasim_config_ops; - vdpa = __vdpa_alloc_device(NULL, ops, + vdpa = __vdpa_alloc_device(NULL, ops, NULL, dev_attr->ngroups, dev_attr->nas, dev_attr->alloc_size, dev_attr->name, use_va); diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index f68ed569394c..ad782d20a8ed 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -2009,7 +2009,8 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) return -EEXIST; vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, - &vduse_vdpa_config_ops, 1, 1, name, true); + &vduse_vdpa_config_ops, NULL, + 1, 1, name, true); if (IS_ERR(vdev)) return PTR_ERR(vdev); diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index 242641c0f2bd..17a19a728c9c 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -511,7 +511,8 @@ static int vp_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name, int ret, i; vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, - dev, &vp_vdpa_ops, 1, 1, name, false); + dev, &vp_vdpa_ops, NULL, + 1, 1, name, false); if (IS_ERR(vp_vdpa)) { dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n"); diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index 8b27c6e8eebb..2464fa655f09 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -466,9 +466,11 @@ static int virtio_vdpa_probe(struct vdpa_device *vdpa) if (!vd_dev) return -ENOMEM; - 
vd_dev->vdev.dev.parent = vdpa_get_map(vdpa).dma_dev; + vd_dev->vdev.dev.parent = vdpa->map ? &vdpa->dev : + vdpa_get_map(vdpa).dma_dev; vd_dev->vdev.dev.release = virtio_vdpa_release_dev; vd_dev->vdev.config = &virtio_vdpa_config_ops; + vd_dev->vdev.map = vdpa->map; vd_dev->vdpa = vdpa; vd_dev->vdev.id.device = ops->get_device_id(vdpa); diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index ae0451945851..4cf21d6e9cfd 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -76,6 +76,7 @@ struct vdpa_mgmt_dev; * because core frees it; use driver_set_override() to * set or clear it. * @config: the configuration ops for this device. + * @map: the map ops for this device * @cf_lock: Protects get and set access to configuration layout. * @index: device index * @features_valid: were features initialized? for legacy guests @@ -91,6 +92,7 @@ struct vdpa_device { union virtio_map vmap; const char *driver_override; const struct vdpa_config_ops *config; + const struct virtio_map_ops *map; struct rw_semaphore cf_lock; /* Protects get/set config */ unsigned int index; bool features_valid; @@ -447,6 +449,7 @@ struct vdpa_config_ops { struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, + const struct virtio_map_ops *map, unsigned int ngroups, unsigned int nas, size_t size, const char *name, bool use_va); @@ -458,6 +461,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, * @member: the name of struct vdpa_device within the @dev_struct * @parent: the parent device * @config: the bus operations that is supported by this device + * @map: the map operations that is supported by this device * @ngroups: the number of virtqueue groups supported by this device * @nas: the number of address spaces * @name: name of the vdpa device @@ -465,10 +469,10 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, * * Return allocated data structure or ERR_PTR upon error */ -#define vdpa_alloc_device(dev_struct, member, parent, config, ngroups, nas, \ - name, use_va) \ +#define vdpa_alloc_device(dev_struct, member, parent, config, map, \ + ngroups, nas, name, use_va) \ container_of((__vdpa_alloc_device( \ - parent, config, ngroups, nas, \ + parent, config, map, ngroups, nas, \ (sizeof(dev_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ dev_struct, member))), name, use_va)), \ -- cgit v1.2.3 From 1c14b0e4ba988381e362ad8a9651eff0b21bd47f Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 24 Sep 2025 15:00:45 +0800 Subject: vduse: switch to use virtio map API instead of DMA API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lacking support for device-specific mapping in virtio, VDUSE must trick the DMA API in order to make the virtio-vdpa transport work. This is done by advertising the vDPA device as a DMA device with a VDUSE-specific dma_ops even though it doesn't do DMA at all. This patch fixes that: thanks to the new mapping operations supported by virtio and vDPA, VDUSE can simply advertise its specific mapping operations to virtio via virtio-vdpa. The DMA API is then no longer needed for VDUSE, and the iova domain can be used as the mapping token instead. Signed-off-by: Jason Wang Message-Id: <20250924070045.10361-3-jasowang@redhat.com> Signed-off-by: Michael S.
Tsirkin Reviewed-by: Eugenio Pérez --- drivers/vdpa/Kconfig | 8 +--- drivers/vdpa/vdpa_user/iova_domain.c | 2 +- drivers/vdpa/vdpa_user/iova_domain.h | 2 +- drivers/vdpa/vdpa_user/vduse_dev.c | 78 ++++++++++++++++++------------------ include/linux/virtio.h | 4 ++ 5 files changed, 46 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index 559fb9d3271f..857cf288c876 100644 --- a/drivers/vdpa/Kconfig +++ b/drivers/vdpa/Kconfig @@ -34,13 +34,7 @@ config VDPA_SIM_BLOCK config VDPA_USER tristate "VDUSE (vDPA Device in Userspace) support" - depends on EVENTFD && MMU && HAS_DMA - # - # This driver incorrectly tries to override the dma_ops. It should - # never have done that, but for now keep it working on architectures - # that use dma ops - # - depends on ARCH_HAS_DMA_OPS + depends on EVENTFD && MMU select VHOST_IOTLB select IOMMU_IOVA help diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c index 58116f89d8da..ccaed24b7ef8 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.c +++ b/drivers/vdpa/vdpa_user/iova_domain.c @@ -447,7 +447,7 @@ void vduse_domain_unmap_page(struct vduse_iova_domain *domain, void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, size_t size, dma_addr_t *dma_addr, - gfp_t flag, unsigned long attrs) + gfp_t flag) { struct iova_domain *iovad = &domain->consistent_iovad; unsigned long limit = domain->iova_limit; diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h index 7f3f0928ec78..1f3c30be272a 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.h +++ b/drivers/vdpa/vdpa_user/iova_domain.h @@ -64,7 +64,7 @@ void vduse_domain_unmap_page(struct vduse_iova_domain *domain, void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, size_t size, dma_addr_t *dma_addr, - gfp_t flag, unsigned long attrs); + gfp_t flag); void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, void *vaddr, dma_addr_t dma_addr, diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index ad782d20a8ed..e7bced0b5542 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -814,59 +814,53 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = { .free = vduse_vdpa_free, }; -static void vduse_dev_sync_single_for_device(struct device *dev, +static void vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; vduse_domain_sync_single_for_device(domain, dma_addr, size, dir); } -static void vduse_dev_sync_single_for_cpu(struct device *dev, +static void vduse_dev_sync_single_for_cpu(union virtio_map token, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir); } -static dma_addr_t vduse_dev_map_page(struct device *dev, struct page *page, +static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain 
*domain = token.iova_domain; return vduse_domain_map_page(domain, page, offset, size, dir, attrs); } -static void vduse_dev_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) +static void vduse_dev_unmap_page(union virtio_map token, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs); } -static void *vduse_dev_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_addr, gfp_t flag, - unsigned long attrs) +static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size, + dma_addr_t *dma_addr, gfp_t flag) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; unsigned long iova; void *addr; *dma_addr = DMA_MAPPING_ERROR; addr = vduse_domain_alloc_coherent(domain, size, - (dma_addr_t *)&iova, flag, attrs); + (dma_addr_t *)&iova, flag); if (!addr) return NULL; @@ -875,31 +869,45 @@ static void *vduse_dev_alloc_coherent(struct device *dev, size_t size, return addr; } -static void vduse_dev_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_addr, - unsigned long attrs) +static void vduse_dev_free_coherent(union virtio_map token, size_t size, + void *vaddr, dma_addr_t dma_addr, + unsigned long attrs) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs); } -static size_t vduse_dev_max_mapping_size(struct device *dev) +static bool vduse_dev_need_sync(union virtio_map token, dma_addr_t dma_addr) { - struct vduse_dev *vdev = dev_to_vduse(dev); - struct vduse_iova_domain *domain = vdev->domain; + struct vduse_iova_domain *domain = token.iova_domain; + + return dma_addr < domain->bounce_size; +} + +static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr) +{ + if (unlikely(dma_addr == DMA_MAPPING_ERROR)) + return -ENOMEM; + return 0; +} + +static size_t vduse_dev_max_mapping_size(union virtio_map token) +{ + struct vduse_iova_domain *domain = token.iova_domain; return domain->bounce_size; } -static const struct dma_map_ops vduse_dev_dma_ops = { +static const struct virtio_map_ops vduse_map_ops = { .sync_single_for_device = vduse_dev_sync_single_for_device, .sync_single_for_cpu = vduse_dev_sync_single_for_cpu, .map_page = vduse_dev_map_page, .unmap_page = vduse_dev_unmap_page, .alloc = vduse_dev_alloc_coherent, .free = vduse_dev_free_coherent, + .need_sync = vduse_dev_need_sync, + .mapping_error = vduse_dev_mapping_error, .max_mapping_size = vduse_dev_max_mapping_size, }; @@ -2003,27 +2011,18 @@ static struct vduse_mgmt_dev *vduse_mgmt; static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) { struct vduse_vdpa *vdev; - int ret; if (dev->vdev) return -EEXIST; vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, - &vduse_vdpa_config_ops, NULL, + &vduse_vdpa_config_ops, &vduse_map_ops, 1, 1, name, true); if (IS_ERR(vdev)) return PTR_ERR(vdev); dev->vdev = vdev; vdev->dev = dev; - vdev->vdpa.dev.dma_mask = &vdev->vdpa.dev.coherent_dma_mask; - ret = dma_set_mask_and_coherent(&vdev->vdpa.dev, DMA_BIT_MASK(64)); - if (ret) { - 
put_device(&vdev->vdpa.dev); - return ret; - } - set_dma_ops(&vdev->vdpa.dev, &vduse_dev_dma_ops); - vdev->vdpa.vmap.dma_dev = &vdev->vdpa.dev; vdev->vdpa.mdev = &vduse_mgmt->mgmt_dev; return 0; @@ -2056,6 +2055,7 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, return -ENOMEM; } + dev->vdev->vdpa.vmap.iova_domain = dev->domain; ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num); if (ret) { put_device(&dev->vdev->vdpa.dev); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 3386a4a8d06b..96c66126c074 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -41,9 +41,13 @@ struct virtqueue { void *priv; }; +struct vduse_iova_domain; + union virtio_map { /* Device that performs DMA */ struct device *dma_dev; + /* VDUSE specific mapping data */ + struct vduse_iova_domain *iova_domain; }; int virtqueue_add_outbuf(struct virtqueue *vq, -- cgit v1.2.3 From f97aef092e199c10a3da96ae79b571edd5362faa Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 12:12:37 +0200 Subject: cpufreq: Make drivers using CPUFREQ_ETERNAL specify transition latency Commit a755d0e2d41b ("cpufreq: Honour transition_latency over transition_delay_us") caused platforms where cpuinfo.transition_latency is CPUFREQ_ETERNAL to get a very large transition latency whereas previously it had been capped at 10 ms (and later at 2 ms). This led to a user-observable regression between 6.6 and 6.12 as described by Shawn: "The dbs sampling_rate was 10000 us on 6.6 and suddently becomes 6442450 us (4294967295 / 1000 * 1.5) on 6.12 for these platforms because the default transition delay was dropped [...]. It slows down dbs governor's reacting to CPU loading change dramatically. Also, as transition_delay_us is used by schedutil governor as rate_limit_us, it shows a negative impact on device idle power consumption, because the device gets slightly less time in the lowest OPP." Evidently, the expectation of the drivers using CPUFREQ_ETERNAL as cpuinfo.transition_latency was that it would be capped by the core, but they may as well return a default transition latency value instead of CPUFREQ_ETERNAL and the core need not do anything with it. Accordingly, introduce CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS and make all of the drivers in question use it instead of CPUFREQ_ETERNAL. Also update the related Rust binding. Fixes: a755d0e2d41b ("cpufreq: Honour transition_latency over transition_delay_us") Closes: https://lore.kernel.org/linux-pm/20250922125929.453444-1-shawnguo2@yeah.net/ Reported-by: Shawn Guo Reviewed-by: Mario Limonciello (AMD) Reviewed-by: Jie Zhan Acked-by: Viresh Kumar Cc: 6.6+ # 6.6+ Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/2264949.irdbgypaU6@rafael.j.wysocki [ rjw: Fix typo in new symbol name, drop redundant type cast from Rust binding ] Tested-by: Shawn Guo # with cpufreq-dt driver Reviewed-by: Qais Yousef Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq-dt.c | 2 +- drivers/cpufreq/imx6q-cpufreq.c | 2 +- drivers/cpufreq/mediatek-cpufreq-hw.c | 2 +- drivers/cpufreq/rcpufreq_dt.rs | 2 +- drivers/cpufreq/scmi-cpufreq.c | 2 +- drivers/cpufreq/scpi-cpufreq.c | 2 +- drivers/cpufreq/spear-cpufreq.c | 2 +- include/linux/cpufreq.h | 3 +++ rust/kernel/cpufreq.rs | 7 ++++--- 9 files changed, 14 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 506437489b4d..7d5079fd1688 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -104,7 +104,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) transition_latency = dev_pm_opp_get_max_transition_latency(cpu_dev); if (!transition_latency) - transition_latency = CPUFREQ_ETERNAL; + transition_latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; cpumask_copy(policy->cpus, priv->cpus); policy->driver_data = priv; diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c index db1c88e9d3f9..e93697d3edfd 100644 --- a/drivers/cpufreq/imx6q-cpufreq.c +++ b/drivers/cpufreq/imx6q-cpufreq.c @@ -442,7 +442,7 @@ soc_opp_out: } if (of_property_read_u32(np, "clock-latency", &transition_latency)) - transition_latency = CPUFREQ_ETERNAL; + transition_latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; /* * Calculate the ramp time for max voltage change in the diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c index fce5aa5ceea0..ae4500ab4891 100644 --- a/drivers/cpufreq/mediatek-cpufreq-hw.c +++ b/drivers/cpufreq/mediatek-cpufreq-hw.c @@ -309,7 +309,7 @@ static int mtk_cpufreq_hw_cpu_init(struct cpufreq_policy *policy) latency = readl_relaxed(data->reg_bases[REG_FREQ_LATENCY]) * 1000; if (!latency) - latency = CPUFREQ_ETERNAL; + latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; policy->cpuinfo.transition_latency = latency; policy->fast_switch_possible = true; diff --git a/drivers/cpufreq/rcpufreq_dt.rs b/drivers/cpufreq/rcpufreq_dt.rs index 7e1fbf9a091f..3909022e1c74 100644 --- a/drivers/cpufreq/rcpufreq_dt.rs +++ b/drivers/cpufreq/rcpufreq_dt.rs @@ -123,7 +123,7 @@ impl cpufreq::Driver for CPUFreqDTDriver { let mut transition_latency = opp_table.max_transition_latency_ns() as u32; if transition_latency == 0 { - transition_latency = cpufreq::ETERNAL_LATENCY_NS; + transition_latency = cpufreq::DEFAULT_TRANSITION_LATENCY_NS; } policy diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index 38c165d526d1..d2a110079f5f 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -294,7 +294,7 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy) latency = perf_ops->transition_latency_get(ph, domain); if (!latency) - latency = CPUFREQ_ETERNAL; + latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; policy->cpuinfo.transition_latency = latency; diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c index dcbb0ae7dd47..e530345baddf 100644 --- a/drivers/cpufreq/scpi-cpufreq.c +++ b/drivers/cpufreq/scpi-cpufreq.c @@ -157,7 +157,7 @@ static int scpi_cpufreq_init(struct cpufreq_policy *policy) latency = scpi_ops->get_transition_latency(cpu_dev); if (!latency) - latency = CPUFREQ_ETERNAL; + latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; policy->cpuinfo.transition_latency = latency; diff --git a/drivers/cpufreq/spear-cpufreq.c b/drivers/cpufreq/spear-cpufreq.c index 707c71090cc3..2a1550e1aa21 100644 --- a/drivers/cpufreq/spear-cpufreq.c +++ 
b/drivers/cpufreq/spear-cpufreq.c @@ -182,7 +182,7 @@ static int spear_cpufreq_probe(struct platform_device *pdev) if (of_property_read_u32(np, "clock-latency", &spear_cpufreq.transition_latency)) - spear_cpufreq.transition_latency = CPUFREQ_ETERNAL; + spear_cpufreq.transition_latency = CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; cnt = of_property_count_u32_elems(np, "cpufreq_tbl"); if (cnt <= 0) { diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 40966512ea18..bc8c083bc16a 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -32,6 +32,9 @@ */ #define CPUFREQ_ETERNAL (-1) + +#define CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS NSEC_PER_MSEC + #define CPUFREQ_NAME_LEN 16 /* Print length for names. Extra 1 space for accommodating '\n' in prints */ #define CPUFREQ_NAME_PLEN (CPUFREQ_NAME_LEN + 1) diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs index eea57ba95f24..2ea735700ae7 100644 --- a/rust/kernel/cpufreq.rs +++ b/rust/kernel/cpufreq.rs @@ -39,7 +39,8 @@ use macros::vtable; const CPUFREQ_NAME_LEN: usize = bindings::CPUFREQ_NAME_LEN as usize; /// Default transition latency value in nanoseconds. -pub const ETERNAL_LATENCY_NS: u32 = bindings::CPUFREQ_ETERNAL as u32; +pub const DEFAULT_TRANSITION_LATENCY_NS: u32 = + bindings::CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS; /// CPU frequency driver flags. pub mod flags { @@ -400,13 +401,13 @@ impl TableBuilder { /// The following example demonstrates how to create a CPU frequency table. /// /// ``` -/// use kernel::cpufreq::{ETERNAL_LATENCY_NS, Policy}; +/// use kernel::cpufreq::{DEFAULT_TRANSITION_LATENCY_NS, Policy}; /// /// fn update_policy(policy: &mut Policy) { /// policy /// .set_dvfs_possible_from_any_cpu(true) /// .set_fast_switch_possible(true) -/// .set_transition_latency_ns(ETERNAL_LATENCY_NS); +/// .set_transition_latency_ns(DEFAULT_TRANSITION_LATENCY_NS); /// /// pr_info!("The policy details are: {:?}\n", (policy.cpu(), policy.cur())); /// } -- cgit v1.2.3 From 950c6451a5c38d375993c3b9da427e2e69b01c30 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 26 Sep 2025 12:31:47 +0200 Subject: cpufreq: Drop unused symbol CPUFREQ_ETERNAL Drop CPUFREQ_ETERNAL that has no users any more along with all references to it in the documentation. No functional impact. Reviewed-by: Mario Limonciello (AMD) Reviewed-by: Jie Zhan Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Reviewed-by: Qais Yousef --- Documentation/admin-guide/pm/cpufreq.rst | 4 ---- Documentation/cpu-freq/cpu-drivers.rst | 3 +-- Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst | 3 +-- Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst | 3 +-- include/linux/cpufreq.h | 5 ----- 5 files changed, 3 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/pm/cpufreq.rst b/Documentation/admin-guide/pm/cpufreq.rst index cacb9f0307dd..738d7b4dc33a 100644 --- a/Documentation/admin-guide/pm/cpufreq.rst +++ b/Documentation/admin-guide/pm/cpufreq.rst @@ -274,10 +274,6 @@ are the following: The time it takes to switch the CPUs belonging to this policy from one P-state to another, in nanoseconds. - If unknown or if known to be so high that the scaling driver does not - work with the `ondemand`_ governor, -1 (:c:macro:`CPUFREQ_ETERNAL`) - will be returned by reads from this attribute. - ``related_cpus`` List of all (online and offline) CPUs belonging to this policy. 
diff --git a/Documentation/cpu-freq/cpu-drivers.rst b/Documentation/cpu-freq/cpu-drivers.rst index d84ededb66f9..c5635ac3de54 100644 --- a/Documentation/cpu-freq/cpu-drivers.rst +++ b/Documentation/cpu-freq/cpu-drivers.rst @@ -109,8 +109,7 @@ Then, the driver must fill in the following values: +-----------------------------------+--------------------------------------+ |policy->cpuinfo.transition_latency | the time it takes on this CPU to | | | switch between two frequencies in | -| | nanoseconds (if appropriate, else | -| | specify CPUFREQ_ETERNAL) | +| | nanoseconds | +-----------------------------------+--------------------------------------+ |policy->cur | The current operating frequency of | | | this CPU (if appropriate) | diff --git a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst index 2ca92042767b..8238f4c6e4f5 100644 --- a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst +++ b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst @@ -112,8 +112,7 @@ CPUfreq核心层注册一个cpufreq_driver结构体。 | | | +-----------------------------------+--------------------------------------+ |policy->cpuinfo.transition_latency | CPU在两个频率之间切换所需的时间,以 | -| | 纳秒为单位(如不适用,设定为 | -| | CPUFREQ_ETERNAL) | +| | 纳秒为单位 | | | | +-----------------------------------+--------------------------------------+ |policy->cur | 该CPU当前的工作频率(如适用) | diff --git a/Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst index add3de2d4523..5435c3928d4b 100644 --- a/Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst +++ b/Documentation/translations/zh_TW/cpu-freq/cpu-drivers.rst @@ -112,8 +112,7 @@ CPUfreq核心層註冊一個cpufreq_driver結構體。 | | | +-----------------------------------+--------------------------------------+ |policy->cpuinfo.transition_latency | CPU在兩個頻率之間切換所需的時間,以 | -| | 納秒爲單位(如不適用,設定爲 | -| | CPUFREQ_ETERNAL) | +| | 納秒爲單位 | | | | +-----------------------------------+--------------------------------------+ |policy->cur | 該CPU當前的工作頻率(如適用) | diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index bc8c083bc16a..0465d1e6f72a 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -26,13 +26,8 @@ *********************************************************************/ /* * Frequency values here are CPU kHz - * - * Maximum transition latency is in nanoseconds - if it's unknown, - * CPUFREQ_ETERNAL shall be used. */ -#define CPUFREQ_ETERNAL (-1) - #define CPUFREQ_DEFAULT_TRANSITION_LATENCY_NS NSEC_PER_MSEC #define CPUFREQ_NAME_LEN 16 -- cgit v1.2.3 From 4e66293bb141df33d5eb1f922e16fe05913bf296 Mon Sep 17 00:00:00 2001 From: Bhanu Seshu Kumar Valluri Date: Wed, 1 Oct 2025 14:27:16 +0530 Subject: of: doc: Fix typo in doc comments. synthetized => synthesized definied => defined sucess => success Signed-off-by: Bhanu Seshu Kumar Valluri Signed-off-by: Rob Herring (Arm) --- drivers/of/irq.c | 2 +- drivers/of/overlay.c | 2 +- include/linux/of.h | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/irq.c b/drivers/of/irq.c index e7c12abd10ab..1ca66b6e267c 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -163,7 +163,7 @@ const __be32 *of_irq_parse_imap_parent(const __be32 *imap, int len, struct of_ph * @out_irq: structure of_phandle_args updated by this function * * This function is a low-level interrupt tree walking function. 
It - * can be used to do a partial walk with synthetized reg and interrupts + * can be used to do a partial walk with synthesized reg and interrupts * properties, for example when resolving PCI interrupts when no device * node exist for the parent. It takes an interrupt specifier structure as * input, walks the tree looking for any interrupt-map properties, translates diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c index 1af6f52d0708..255e8362f600 100644 --- a/drivers/of/overlay.c +++ b/drivers/of/overlay.c @@ -135,7 +135,7 @@ static BLOCKING_NOTIFIER_HEAD(overlay_notify_chain); * @nb: Notifier block to register * * Register for notification on overlay operations on device tree nodes. The - * reported actions definied by @of_reconfig_change. The notifier callback + * reported actions defined by @of_reconfig_change. The notifier callback * furthermore receives a pointer to the affected device tree node. * * Note that a notifier callback is not supposed to store pointers to a device diff --git a/include/linux/of.h b/include/linux/of.h index 5e2c6ed9370a..121a288ca92d 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -1134,7 +1134,7 @@ static inline bool of_phandle_args_equal(const struct of_phandle_args *a1, * Search for a property in a device node and count the number of u8 elements * in it. * - * Return: The number of elements on sucess, -EINVAL if the property does + * Return: The number of elements on success, -EINVAL if the property does * not exist or its length does not match a multiple of u8 and -ENODATA if the * property does not have a value. */ @@ -1153,7 +1153,7 @@ static inline int of_property_count_u8_elems(const struct device_node *np, * Search for a property in a device node and count the number of u16 elements * in it. * - * Return: The number of elements on sucess, -EINVAL if the property does + * Return: The number of elements on success, -EINVAL if the property does * not exist or its length does not match a multiple of u16 and -ENODATA if the * property does not have a value. */ @@ -1172,7 +1172,7 @@ static inline int of_property_count_u16_elems(const struct device_node *np, * Search for a property in a device node and count the number of u32 elements * in it. * - * Return: The number of elements on sucess, -EINVAL if the property does + * Return: The number of elements on success, -EINVAL if the property does * not exist or its length does not match a multiple of u32 and -ENODATA if the * property does not have a value. */ @@ -1191,7 +1191,7 @@ static inline int of_property_count_u32_elems(const struct device_node *np, * Search for a property in a device node and count the number of u64 elements * in it. * - * Return: The number of elements on sucess, -EINVAL if the property does + * Return: The number of elements on success, -EINVAL if the property does * not exist or its length does not match a multiple of u64 and -ENODATA if the * property does not have a value. */ -- cgit v1.2.3 From 510d76646a6a7beaa49fc0da7282e285a3dfce97 Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Tue, 23 Sep 2025 19:21:36 +0800 Subject: block: Update a comment of disk statistics From commit 074a7aca7afa ("block: move stats from disk to part0"), we know that: * {disk|all}_stat_*() are gone. * disk_stat_lock/unlock() are renamed to part_stat_lock/unlock(). Therefore, outdated comments should be updated accordingly.
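For reference, the usage pattern the updated comment documents looks roughly like this (a minimal sketch following the current part_stat.h macros; bdev and nr_sectors stand in for whatever block_device and sector count the caller already holds):

	part_stat_lock();
	part_stat_inc(bdev, ios[STAT_READ]);
	part_stat_add(bdev, sectors[STAT_READ], nr_sectors);
	part_stat_unlock();

	/* part_stat_read() needs no lock */
	reads = part_stat_read(bdev, ios[STAT_READ]);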
Fixes: 074a7aca7afa ("block: move stats from disk to part0") Signed-off-by: Tang Yizhou Signed-off-by: Jens Axboe --- include/linux/part_stat.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index eeeff2a04529..729415e91215 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -17,8 +17,8 @@ struct disk_stats { /* * Macros to operate on percpu disk statistics: * - * {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters and should - * be called between disk_stat_lock() and disk_stat_unlock(). + * part_stat_{add|sub|inc|dec}() modify the stat counters and should + * be called between part_stat_lock() and part_stat_unlock(). * * part_stat_read() can be called at any time. */ -- cgit v1.2.3 From 1b54b0756f051c11f5a5d0fbc1581e0b9a18e2bc Mon Sep 17 00:00:00 2001 From: Bhanu Seshu Kumar Valluri Date: Wed, 1 Oct 2025 16:27:15 +0530 Subject: net: doc: Fix typos in docs Fix typos in doc comments. Signed-off-by: Bhanu Seshu Kumar Valluri Link: https://patch.msgid.link/20251001105715.50462-1-bhanuseshukumar@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 4 ++-- net/tipc/crypto.c | 2 +- net/tipc/topsrv.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 7a54a8b4d277..3c7634482356 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -297,7 +297,7 @@ static inline const char *phy_modes(phy_interface_t interface) * * Description: maps RGMII supported link speeds into the clock rates. * This can also be used for MII, GMII, and RMII interface modes as the - * clock rates are indentical, but the caller must be aware that errors + * clock rates are identical, but the caller must be aware that errors * for unsupported clock rates will not be signalled. * * Returns: clock rate or negative errno @@ -519,7 +519,7 @@ enum phy_state { * struct phy_c45_device_ids - 802.3-c45 Device Identifiers * @devices_in_package: IEEE 802.3 devices in package register value. * @mmds_present: bit vector of MMDs present. - * @device_ids: The device identifer for each present device. + * @device_ids: The device identifier for each present device. */ struct phy_c45_device_ids { u32 devices_in_package; diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index ea5bb131ebd0..751904f10aab 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -1797,7 +1797,7 @@ exit: * @b: bearer where the message has been received * * If the decryption is successful, the decrypted skb is returned directly or - * as the callback, the encryption header and auth tag will be trimed out + * as the callback, the encryption header and auth tag will be trimmed out * before forwarding to tipc_rcv() via the tipc_crypto_rcv_complete(). * Otherwise, the skb will be freed! 
* Note: RX key(s) can be re-aligned, or in case of no key suitable, TX diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index ffe577bf6b51..aad7f96b6009 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -57,7 +57,7 @@ * @conn_idr: identifier set of connection * @idr_lock: protect the connection identifier set * @idr_in_use: amount of allocated identifier entry - * @net: network namspace instance + * @net: network namespace instance * @awork: accept work item * @rcv_wq: receive workqueue * @send_wq: send workqueue @@ -83,7 +83,7 @@ struct tipc_topsrv { * @sock: socket handler associated with connection * @flags: indicates connection state * @server: pointer to connected server - * @sub_list: lsit to all pertaing subscriptions + * @sub_list: list to all pertaining subscriptions * @sub_lock: lock protecting the subscription list * @rwork: receive work item * @outqueue: pointer to first outbound message in queue -- cgit v1.2.3 From 384b52ce32110db974d3b61d463af48347eb73fb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 1 Oct 2025 19:34:29 +0200 Subject: PM: runtime: Introduce one more usage counter guard Follow previous commit 9a0abc39450a ("PM: runtime: Add auto-cleanup macros for "resume and get" operations") and define a runtime PM usage counter guard in which pm_runtime_get_noresume() and pm_runtime_put_noidle() will be used for incrementing and decrementing it, respectively. Signed-off-by: Rafael J. Wysocki Reviewed-by: Jonathan Cameron --- include/linux/pm_runtime.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index edb8aed5ef62..a3f44f6c2da1 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -614,6 +614,9 @@ static inline int pm_runtime_put_autosuspend(struct device *dev) return __pm_runtime_put_autosuspend(dev); } +DEFINE_GUARD(pm_runtime_noresume, struct device *, + pm_runtime_get_noresume(_T), pm_runtime_put_noidle(_T)); + DEFINE_GUARD(pm_runtime_active, struct device *, pm_runtime_get_sync(_T), pm_runtime_put(_T)); DEFINE_GUARD(pm_runtime_active_auto, struct device *, -- cgit v1.2.3 From b8179af120943e2fc099ea87caa234039a709a66 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 29 Jul 2025 08:46:35 +0200 Subject: mm/memory_hotplug: activate node before adding new memory blocks The sysfs attributes for memory blocks require the node ID to be set and initialized, so move the node activation before adding new memory blocks. This also has the nice side effect that the BUG_ON() can be converted into a WARN_ON() as we now can handle registration errors. Link: https://lkml.kernel.org/r/20250729064637.51662-3-hare@kernel.org Fixes: b9ff036082cd ("mm/memory_hotplug.c: make add_memory_resource use __try_online_node") Signed-off-by: Hannes Reinecke Acked-by: David Hildenbrand Acked-by: Oscar Salvador Reviewed-by: Donet Tom Signed-off-by: Andrew Morton --- drivers/base/memory.c | 4 ++-- include/linux/memory.h | 2 +- mm/memory_hotplug.c | 32 +++++++++++++++++--------------- 3 files changed, 20 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 894d3891292b..fb212a889e65 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -879,7 +879,7 @@ static void remove_memory_block(struct memory_block *memory) * Called under device_hotplug_lock. 
*/ int create_memory_block_devices(unsigned long start, unsigned long size, - struct vmem_altmap *altmap, + int nid, struct vmem_altmap *altmap, struct memory_group *group) { const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); @@ -893,7 +893,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, return -EINVAL; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - ret = add_memory_block(block_id, NUMA_NO_NODE, MEM_OFFLINE, altmap, group); + ret = add_memory_block(block_id, nid, MEM_OFFLINE, altmap, group); if (ret) break; } diff --git a/include/linux/memory.h b/include/linux/memory.h index 40eb70ccb09d..4a29153e372e 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -159,7 +159,7 @@ static inline unsigned long memory_block_advised_max_size(void) extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size, - struct vmem_altmap *altmap, + int nid, struct vmem_altmap *altmap, struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e9f14de4a9c9..0be83039c3b5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1477,7 +1477,7 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group, } /* create memory block devices after memory was added */ - ret = create_memory_block_devices(cur_start, memblock_size, + ret = create_memory_block_devices(cur_start, memblock_size, nid, params.altmap, group); if (ret) { arch_remove_memory(cur_start, memblock_size, NULL); @@ -1539,8 +1539,16 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) ret = __try_online_node(nid, false); if (ret < 0) - goto error; - new_node = ret; + goto error_memblock_remove; + if (ret) { + node_set_online(nid); + ret = register_one_node(nid); + if (WARN_ON(ret)) { + node_set_offline(nid); + goto error_memblock_remove; + } + new_node = true; + } /* * Self hosted memmap array @@ -1556,24 +1564,13 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error; /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size, NULL, group); + ret = create_memory_block_devices(start, size, nid, NULL, group); if (ret) { arch_remove_memory(start, size, params.altmap); goto error; } } - if (new_node) { - /* If sysfs file of new node can't be created, cpu on the node - * can't be hot-added. There is no rollback way now. - * So, check by BUG_ON() to catch it reluctantly.. - * We online node here. We can't roll back from here. 
- */ - node_set_online(nid); - ret = register_one_node(nid); - BUG_ON(ret); - } - register_memory_blocks_under_node_hotplug(nid, PFN_DOWN(start), PFN_UP(start + size - 1)); @@ -1597,6 +1594,11 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) return ret; error: + if (new_node) { + node_set_offline(nid); + unregister_one_node(nid); + } +error_memblock_remove: if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) memblock_remove(start, size); error_mem_hotplug_end: -- cgit v1.2.3 From 0a947c14e48cbf9de222836170282e0167a9e096 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 29 Jul 2025 08:46:36 +0200 Subject: drivers/base: move memory_block_add_nid() into the caller Now the node id only needs to be set for early memory, so move memory_block_add_nid() into the caller and rename it into memory_block_add_nid_early(). This allows us to further simplify the code by dropping the 'context' argument to do_register_memory_block_under_node(). Link: https://lkml.kernel.org/r/20250729064637.51662-4-hare@kernel.org Suggested-by: David Hildenbrand Signed-off-by: Hannes Reinecke Acked-by: David Hildenbrand Acked-by: Oscar Salvador Reviewed-by: Donet Tom Signed-off-by: Andrew Morton --- drivers/base/memory.c | 36 ++++++++++++++++++------------------ drivers/base/node.c | 10 ++++------ include/linux/memory.h | 3 +-- 3 files changed, 23 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index fb212a889e65..6d84a02cfa5d 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -769,21 +769,22 @@ static struct zone *early_node_zone_for_memory_block(struct memory_block *mem, #ifdef CONFIG_NUMA /** - * memory_block_add_nid() - Indicate that system RAM falling into this memory - * block device (partially) belongs to the given node. + * memory_block_add_nid_early() - Indicate that early system RAM falling into + * this memory block device (partially) belongs + * to the given node. * @mem: The memory block device. * @nid: The node id. - * @context: The memory initialization context. * - * Indicate that system RAM falling into this memory block (partially) belongs - * to the given node. If the context indicates ("early") that we are adding the - * node during node device subsystem initialization, this will also properly - * set/adjust mem->zone based on the zone ranges of the given node. + * Indicate that early system RAM falling into this memory block (partially) + * belongs to the given node. This will also properly set/adjust mem->zone based + * on the zone ranges of the given node. + * + * Memory hotplug handles this on memory block creation, where we can only have + * a single nid span a memory block. */ -void memory_block_add_nid(struct memory_block *mem, int nid, - enum meminit_context context) +void memory_block_add_nid_early(struct memory_block *mem, int nid) { - if (context == MEMINIT_EARLY && mem->nid != nid) { + if (mem->nid != nid) { /* * For early memory we have to determine the zone when setting * the node id and handle multiple nodes spanning a single @@ -797,15 +798,14 @@ void memory_block_add_nid(struct memory_block *mem, int nid, mem->zone = early_node_zone_for_memory_block(mem, nid); else mem->zone = NULL; + /* + * If this memory block spans multiple nodes, we only indicate + * the last processed node. If we span multiple nodes (not applicable + * to hotplugged memory), zone == NULL will prohibit memory offlining + * and consequently unplug. 
+ */ + mem->nid = nid; } - - /* - * If this memory block spans multiple nodes, we only indicate - * the last processed node. If we span multiple nodes (not applicable - * to hotplugged memory), zone == NULL will prohibit memory offlining - * and consequently unplug. - */ - mem->nid = nid; } #endif diff --git a/drivers/base/node.c b/drivers/base/node.c index 67b01d579737..6b6e55a98b79 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -781,13 +781,10 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) #ifdef CONFIG_MEMORY_HOTPLUG static void do_register_memory_block_under_node(int nid, - struct memory_block *mem_blk, - enum meminit_context context) + struct memory_block *mem_blk) { int ret; - memory_block_add_nid(mem_blk, nid, context); - ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, &mem_blk->dev.kobj, kobject_name(&mem_blk->dev.kobj)); @@ -815,7 +812,7 @@ static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk, { int nid = *(int *)arg; - do_register_memory_block_under_node(nid, mem_blk, MEMINIT_HOTPLUG); + do_register_memory_block_under_node(nid, mem_blk); return 0; } @@ -855,7 +852,8 @@ static void register_memory_blocks_under_nodes(void) if (!mem) continue; - do_register_memory_block_under_node(nid, mem, MEMINIT_EARLY); + memory_block_add_nid_early(mem, nid); + do_register_memory_block_under_node(nid, mem); put_device(&mem->dev); } diff --git a/include/linux/memory.h b/include/linux/memory.h index 4a29153e372e..43d378038ce2 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -202,8 +202,7 @@ static inline unsigned long phys_to_block_id(unsigned long phys) } #ifdef CONFIG_NUMA -void memory_block_add_nid(struct memory_block *mem, int nid, - enum meminit_context context); +void memory_block_add_nid_early(struct memory_block *mem, int nid); #endif /* CONFIG_NUMA */ int memory_block_advise_max_size(unsigned long size); unsigned long memory_block_advised_max_size(void); -- cgit v1.2.3 From b71a6e2a1b710ea31c363a72c75f510ef35d8e69 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Thu, 2 Oct 2025 17:12:35 +0900 Subject: i2c: rename wait_for_completion callback to wait_for_completion_cb Functionally no change. Remove the ambiguity of 'wait_for_completion'. It helps development of the DEPT dependency tracker, but seems favorable in any case. 
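For illustration, a PCA bus driver wires up the renamed hook like this (a hypothetical sketch, not taken from this patch; the foo_* names are placeholders, and the return convention of nonzero on completion, zero on timeout follows the existing pca-isa driver):

	static int foo_waitforcompletion(void *pd)
	{
		struct foo_i2c *i2c = pd;

		/* nonzero on completion, zero on timeout */
		return wait_event_timeout(i2c->wait, i2c->done, HZ) > 0;
	}

	static struct i2c_algo_pca_data foo_algo_data = {
		.write_byte		= foo_writebyte,
		.read_byte		= foo_readbyte,
		.wait_for_completion_cb	= foo_waitforcompletion,
		.reset_chip		= foo_resetchip,
	};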
Signed-off-by: Byungchul Park [wsa: reworded commit message] Signed-off-by: Wolfram Sang --- drivers/i2c/algos/i2c-algo-pca.c | 2 +- drivers/i2c/busses/i2c-pca-isa.c | 2 +- drivers/i2c/busses/i2c-pca-platform.c | 2 +- include/linux/i2c-algo-pca.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/algos/i2c-algo-pca.c b/drivers/i2c/algos/i2c-algo-pca.c index 74b66aec33d4..ee86df4cff4b 100644 --- a/drivers/i2c/algos/i2c-algo-pca.c +++ b/drivers/i2c/algos/i2c-algo-pca.c @@ -30,7 +30,7 @@ static int i2c_debug; #define pca_clock(adap) adap->i2c_clock #define pca_set_con(adap, val) pca_outw(adap, I2C_PCA_CON, val) #define pca_get_con(adap) pca_inw(adap, I2C_PCA_CON) -#define pca_wait(adap) adap->wait_for_completion(adap->data) +#define pca_wait(adap) adap->wait_for_completion_cb(adap->data) static void pca_reset(struct i2c_algo_pca_data *adap) { diff --git a/drivers/i2c/busses/i2c-pca-isa.c b/drivers/i2c/busses/i2c-pca-isa.c index 85e8cf58e8bf..0cbf2f509527 100644 --- a/drivers/i2c/busses/i2c-pca-isa.c +++ b/drivers/i2c/busses/i2c-pca-isa.c @@ -95,7 +95,7 @@ static struct i2c_algo_pca_data pca_isa_data = { /* .data intentionally left NULL, not needed with ISA */ .write_byte = pca_isa_writebyte, .read_byte = pca_isa_readbyte, - .wait_for_completion = pca_isa_waitforcompletion, + .wait_for_completion_cb = pca_isa_waitforcompletion, .reset_chip = pca_isa_resetchip, }; diff --git a/drivers/i2c/busses/i2c-pca-platform.c b/drivers/i2c/busses/i2c-pca-platform.c index 87da8241b927..c0f35ebbe37d 100644 --- a/drivers/i2c/busses/i2c-pca-platform.c +++ b/drivers/i2c/busses/i2c-pca-platform.c @@ -180,7 +180,7 @@ static int i2c_pca_pf_probe(struct platform_device *pdev) } i2c->algo_data.data = i2c; - i2c->algo_data.wait_for_completion = i2c_pca_pf_waitforcompletion; + i2c->algo_data.wait_for_completion_cb = i2c_pca_pf_waitforcompletion; if (i2c->gpio) i2c->algo_data.reset_chip = i2c_pca_pf_resetchip; else diff --git a/include/linux/i2c-algo-pca.h b/include/linux/i2c-algo-pca.h index 7c522fdd9ea7..e305bf32e40a 100644 --- a/include/linux/i2c-algo-pca.h +++ b/include/linux/i2c-algo-pca.h @@ -71,7 +71,7 @@ struct i2c_algo_pca_data { void *data; /* private low level data */ void (*write_byte) (void *data, int reg, int val); int (*read_byte) (void *data, int reg); - int (*wait_for_completion) (void *data); + int (*wait_for_completion_cb) (void *data); void (*reset_chip) (void *data); /* For PCA9564, use one of the predefined frequencies: * 330000, 288000, 217000, 146000, 88000, 59000, 44000, 36000 -- cgit v1.2.3 From 929bf010e0599ddef6b640cd314f1de65dd1ca3e Mon Sep 17 00:00:00 2001 From: Li Zhe Date: Thu, 14 Aug 2025 14:47:10 +0800 Subject: mm: introduce num_pages_contiguous() Let's add a simple helper for determining the number of contiguous pages that represent contiguous PFNs. In an ideal world, this helper would be simpler or not even required. Unfortunately, on some configs we still have to maintain (SPARSEMEM without VMEMMAP), the memmap is allocated per memory section, and we might run into weird corner cases of false positives when blindly testing for contiguous pages only. One example of such false positives would be a memory section-sized hole that does not have a memmap. The surrounding memory sections might get "struct pages" that are contiguous, but the PFNs are actually not. This helper will, for example, be useful for determining contiguous PFNs in a GUP result, to batch further operations across returned "struct page"s. 
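As a sketch of that batching pattern (hypothetical; handle_range() is a placeholder consumer, while num_pages_contiguous() is the helper added by the diff below), a caller could walk a pinned array like this:

	static void process_pinned_pages(struct page **pages, size_t nr_pages)
	{
		size_t i = 0;

		while (i < nr_pages) {
			size_t contig = num_pages_contiguous(&pages[i],
							     nr_pages - i);

			/* pages[i] .. pages[i + contig - 1] have contiguous PFNs */
			handle_range(page_to_pfn(pages[i]), contig);
			i += contig;
		}
	}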
VFIO will utilize this interface to accelerate the VFIO DMA map process. Implementation based on Linus' suggestions to avoid new usage of nth_page() where avoidable. Suggested-by: Linus Torvalds Suggested-by: Jason Gunthorpe Signed-off-by: Li Zhe Co-developed-by: David Hildenbrand Signed-off-by: David Hildenbrand Link: https://lore.kernel.org/r/20250814064714.56485-2-lizhe.67@bytedance.com Signed-off-by: Alex Williamson --- include/linux/mm.h | 7 ++++++- include/linux/mm_inline.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 06978b4dbeb8..f092ce3530bb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1833,7 +1833,12 @@ static inline unsigned long memdesc_section(memdesc_flags_t mdf) { return (mdf.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } -#endif +#else /* !SECTION_IN_PAGE_FLAGS */ +static inline unsigned long memdesc_section(memdesc_flags_t mdf) +{ + return 0; +} +#endif /* SECTION_IN_PAGE_FLAGS */ /** * folio_pfn - Return the Page Frame Number of a folio. diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index d6c1011b38f2..f6a2b2d20016 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -617,4 +617,40 @@ static inline bool vma_has_recency(const struct vm_area_struct *vma) return true; } +/** + * num_pages_contiguous() - determine the number of contiguous pages + * that represent contiguous PFNs + * @pages: an array of page pointers + * @nr_pages: length of the array, at least 1 + * + * Determine the number of contiguous pages that represent contiguous PFNs + * in @pages, starting from the first page. + * + * In some kernel configs contiguous PFNs will not have contiguous struct + * pages. In these configurations num_pages_contiguous() will return a number + * smaller than the ideal one. The caller should continue to check for pfn + * contiguity after each call to num_pages_contiguous(). + * + * Returns the number of contiguous pages. + */ +static inline size_t num_pages_contiguous(struct page **pages, size_t nr_pages) +{ + struct page *cur_page = pages[0]; + unsigned long section = memdesc_section(cur_page->flags); + size_t i; + + for (i = 1; i < nr_pages; i++) { + if (++cur_page != pages[i]) + break; + /* + * In unproblematic kernel configs, page_to_section() == 0 and + * the whole check will get optimized out. + */ + if (memdesc_section(cur_page->flags) != section) + break; + } + + return i; +} + #endif -- cgit v1.2.3 From 95920c2ed02bde551ab654e9749c2ca7bc3100e0 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 30 Sep 2025 13:43:29 +0200 Subject: page_pool: Fix PP_MAGIC_MASK to avoid crashing on some 32-bit arches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Helge reported that the introduction of PP_MAGIC_MASK led to crashes on boot on his 32-bit parisc machine. The cause of this is that the mask is set too wide, so page_pool_page_is_pp() incurs false positives which crash the machine. Just disabling the check in page_pool_is_pp() will lead to the page_pool code itself malfunctioning; so instead of doing this, this patch changes the define for PP_DMA_INDEX_BITS to avoid mistaking arbitrary kernel pointers for page_pool-tagged pages. The fix relies on the kernel pointers that alias with the pp_magic field always being above PAGE_OFFSET.
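A quick worked example of what this assumption buys (hedged: it assumes PP_SIGNATURE is 0x40 + POISON_POINTER_DELTA, as in current poison.h, giving PP_DMA_INDEX_SHIFT = 1 + __fls(0x40) = 7): on a 32-bit VMSPLIT_3G config with POISON_POINTER_DELTA = 0 and PAGE_OFFSET = 0xC0000000, __ffs(PAGE_OFFSET) = 30, so the definition below yields PP_DMA_INDEX_BITS = MIN(32, 30 - 7) = 23 and index tracking stays enabled; a config whose PAGE_OFFSET cannot guarantee at least 8 bits above PP_SIGNATURE instead falls back to PP_DMA_INDEX_BITS = 0.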
With this assumption, we can use the lowest set bit of PAGE_OFFSET as the upper bound of the PP_DMA_INDEX_MASK, which should avoid the false positives. Because we cannot rely on PAGE_OFFSET always being a compile-time constant, nor on it always being >0, we fall back to disabling the dma_index storage when there are not enough bits available. This leaves us in the situation we were in before the patch in the Fixes tag, but only on a subset of architecture configurations. This seems to be the best we can do until the transition to page types is complete for page_pool pages. v2: - Make sure there's at least 8 bits available and that the PAGE_OFFSET bit calculation doesn't wrap Link: https://lore.kernel.org/all/aMNJMFa5fDalFmtn@p100/ Fixes: ee62ce7a1d90 ("page_pool: Track DMA-mapped pages and unmap them when destroying the pool") Cc: stable@vger.kernel.org # 6.15+ Tested-by: Helge Deller Signed-off-by: Toke Høiland-Jørgensen Reviewed-by: Mina Almasry Tested-by: Helge Deller Link: https://patch.msgid.link/20250930114331.675412-1-toke@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/mm.h | 22 ++++++++------- net/core/page_pool.c | 76 ++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 66 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1ae97a0b8ec7..0905eb6b55ec 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4159,14 +4159,13 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); * since this value becomes part of PP_SIGNATURE; meaning we can just use the * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is - * 0, we make sure that we leave the two topmost bits empty, as that guarantees - * we won't mistake a valid kernel pointer for a value we set, regardless of the - * VMSPLIT setting. + * 0, we use the lowest bit of PAGE_OFFSET as the boundary if that value is + * known at compile-time. * - * Altogether, this means that the number of bits available is constrained by - * the size of an unsigned long (at the upper end, subtracting two bits per the - * above), and the definition of PP_SIGNATURE (with or without - * POISON_POINTER_DELTA). + * If the value of PAGE_OFFSET is not known at compile time, or if it is too + * small to leave at least 8 bits available above PP_SIGNATURE, we define the + * number of bits to be 0, which turns off the DMA index tracking altogether + * (see page_pool_register_dma_index()). */ #define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) #if POISON_POINTER_DELTA > 0 @@ -4175,8 +4174,13 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); */ #define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) #else -/* Always leave out the topmost two; see above. */ -#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2) +/* Use the lowest bit of PAGE_OFFSET if there's at least 8 bits available; see above */ +#define PP_DMA_INDEX_MIN_OFFSET (1 << (PP_DMA_INDEX_SHIFT + 8)) +#define PP_DMA_INDEX_BITS ((__builtin_constant_p(PAGE_OFFSET) && \ PAGE_OFFSET >= PP_DMA_INDEX_MIN_OFFSET && \ !(PAGE_OFFSET & (PP_DMA_INDEX_MIN_OFFSET - 1))) ?
\ + MIN(32, __ffs(PAGE_OFFSET) - PP_DMA_INDEX_SHIFT) : 0) + #endif #define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 492728f9e021..1a5edec485f1 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -468,11 +468,60 @@ page_pool_dma_sync_for_device(const struct page_pool *pool, } } +static int page_pool_register_dma_index(struct page_pool *pool, + netmem_ref netmem, gfp_t gfp) +{ + int err = 0; + u32 id; + + if (unlikely(!PP_DMA_INDEX_BITS)) + goto out; + + if (in_softirq()) + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + else + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + if (err) { + WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + goto out; + } + + netmem_set_dma_index(netmem, id); +out: + return err; +} + +static int page_pool_release_dma_index(struct page_pool *pool, + netmem_ref netmem) +{ + struct page *old, *page = netmem_to_page(netmem); + unsigned long id; + + if (unlikely(!PP_DMA_INDEX_BITS)) + return 0; + + id = netmem_get_dma_index(netmem); + if (!id) + return -1; + + if (in_softirq()) + old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); + else + old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); + if (old != page) + return -1; + + netmem_set_dma_index(netmem, 0); + + return 0; +} + static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { dma_addr_t dma; int err; - u32 id; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit @@ -491,18 +540,10 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t g goto unmap_failed; } - if (in_softirq()) - err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), - PP_DMA_INDEX_LIMIT, gfp); - else - err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), - PP_DMA_INDEX_LIMIT, gfp); - if (err) { - WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + err = page_pool_register_dma_index(pool, netmem, gfp); + if (err) goto unset_failed; - } - netmem_set_dma_index(netmem, id); page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); return true; @@ -680,8 +721,6 @@ void page_pool_clear_pp_info(netmem_ref netmem) static __always_inline void __page_pool_release_netmem_dma(struct page_pool *pool, netmem_ref netmem) { - struct page *old, *page = netmem_to_page(netmem); - unsigned long id; dma_addr_t dma; if (!pool->dma_map) @@ -690,15 +729,7 @@ static __always_inline void __page_pool_release_netmem_dma(struct page_pool *poo */ return; - id = netmem_get_dma_index(netmem); - if (!id) - return; - - if (in_softirq()) - old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); - else - old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); - if (old != page) + if (page_pool_release_dma_index(pool, netmem)) return; dma = page_pool_get_dma_addr_netmem(netmem); @@ -708,7 +739,6 @@ static __always_inline void __page_pool_release_netmem_dma(struct page_pool *poo PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr_netmem(netmem, 0); - netmem_set_dma_index(netmem, 0); } /* Disconnects a page (from a page_pool). 
API users can have a need -- cgit v1.2.3 From 1ed06c83506ecaaf1836ddeb7c65772ff86d8d53 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Oct 2025 11:06:25 +0200 Subject: block: remove bio_iov_iter_get_pages Switch the only caller to bio_iov_iter_get_pages_aligned, and explain why it does not have any alignment requirements. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Keith Busch Reviewed-by: Qu Wenruo Signed-off-by: Jens Axboe --- block/blk-map.c | 6 +++++- include/linux/bio.h | 5 ----- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/blk-map.c b/block/blk-map.c index 165f2234f00f..6cce652c7fa6 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -283,7 +283,11 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask); if (!bio) return -ENOMEM; - ret = bio_iov_iter_get_pages(bio, iter); + /* + * No alignment requirements on our part to support arbitrary + * passthrough commands. + */ + ret = bio_iov_iter_get_pages_aligned(bio, iter, 0); if (ret) goto out_put; ret = blk_rq_append_bio(rq, bio); diff --git a/include/linux/bio.h b/include/linux/bio.h index a64a30131031..b01dae9506de 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -449,11 +449,6 @@ int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask); -static inline int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) -{ - return bio_iov_iter_get_pages_aligned(bio, iter, 0); -} - void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); -- cgit v1.2.3 From 82dd5d763c9b718e2d655b9565e0a06a91bb83dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Oct 2025 11:06:26 +0200 Subject: block: rename bio_iov_iter_get_pages_aligned to bio_iov_iter_get_pages Now that the bio_iov_iter_get_pages name is free again, use it instead of the more complicated one. Also drop the unused export. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Keith Busch Reviewed-by: Qu Wenruo Signed-off-by: Jens Axboe --- block/bio.c | 5 ++--- block/blk-map.c | 2 +- include/linux/bio.h | 2 +- include/linux/blkdev.h | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 3a1a848940dd..b3a79285c278 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1316,7 +1316,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, } /** - * bio_iov_iter_get_pages_aligned - add user or kernel pages to a bio + * bio_iov_iter_get_pages - add user or kernel pages to a bio * @bio: bio to add pages to * @iter: iov iterator describing the region to be added * @len_align_mask: the mask to align the total size to, 0 for any length * @@ -1336,7 +1336,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned.
*/ -int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask) { int ret = 0; @@ -1360,7 +1360,6 @@ int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, return bio_iov_iter_align_down(bio, iter, len_align_mask); return ret; } -EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages_aligned); static void submit_bio_wait_endio(struct bio *bio) { diff --git a/block/blk-map.c b/block/blk-map.c index 6cce652c7fa6..60faf036fb6e 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -287,7 +287,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, * No alignment requirements on our part to support arbitrary * passthrough commands. */ - ret = bio_iov_iter_get_pages_aligned(bio, iter, 0); + ret = bio_iov_iter_get_pages(bio, iter, 0); if (ret) goto out_put; ret = blk_rq_append_bio(rq, bio); diff --git a/include/linux/bio.h b/include/linux/bio.h index b01dae9506de..16c1c85613b7 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -446,7 +446,7 @@ int submit_bio_wait(struct bio *bio); int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, size_t len, enum req_op op); -int bio_iov_iter_get_pages_aligned(struct bio *bio, struct iov_iter *iter, +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask); void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 066e5309bd45..c97e8b0e67b6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1876,7 +1876,7 @@ static inline int bio_split_rw_at(struct bio *bio, static inline int bio_iov_iter_get_bdev_pages(struct bio *bio, struct iov_iter *iter, struct block_device *bdev) { - return bio_iov_iter_get_pages_aligned(bio, iter, + return bio_iov_iter_get_pages(bio, iter, bdev_logical_block_size(bdev) - 1); } -- cgit v1.2.3 From 506aa235f6e0baa00bf792df82a5e9f618b7a5d8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Oct 2025 11:06:28 +0200 Subject: block: move bio_iov_iter_get_bdev_pages to block/fops.c Keep bio_iov_iter_get_bdev_pages local with the callers, as blindly looking at the bdev logical block size is often not the best idea unless on a block device. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Keith Busch Reviewed-by: Qu Wenruo Signed-off-by: Jens Axboe --- block/fops.c | 13 ++++++++++--- include/linux/blkdev.h | 7 ------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/block/fops.c b/block/fops.c index c2c0396ea9ee..5e3db9fead77 100644 --- a/block/fops.c +++ b/block/fops.c @@ -43,6 +43,13 @@ static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb, (bdev_logical_block_size(bdev) - 1); } +static inline int blkdev_iov_iter_get_pages(struct bio *bio, + struct iov_iter *iter, struct block_device *bdev) +{ + return bio_iov_iter_get_pages(bio, iter, + bdev_logical_block_size(bdev) - 1); +} + #define DIO_INLINE_BIO_VECS 4 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, @@ -78,7 +85,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, if (iocb->ki_flags & IOCB_ATOMIC) bio.bi_opf |= REQ_ATOMIC; - ret = bio_iov_iter_get_bdev_pages(&bio, iter, bdev); + ret = blkdev_iov_iter_get_pages(&bio, iter, bdev); if (unlikely(ret)) goto out; ret = bio.bi_iter.bi_size; @@ -212,7 +219,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; - ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev); + ret = blkdev_iov_iter_get_pages(bio, iter, bdev); if (unlikely(ret)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); @@ -348,7 +355,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, */ bio_iov_bvec_set(bio, iter); } else { - ret = bio_iov_iter_get_bdev_pages(bio, iter, bdev); + ret = blkdev_iov_iter_get_pages(bio, iter, bdev); if (unlikely(ret)) goto out_bio_put; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c97e8b0e67b6..d4db5039836d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1873,13 +1873,6 @@ static inline int bio_split_rw_at(struct bio *bio, return bio_split_io_at(bio, lim, segs, max_bytes, lim->dma_alignment); } -static inline int bio_iov_iter_get_bdev_pages(struct bio *bio, - struct iov_iter *iter, struct block_device *bdev) -{ - return bio_iov_iter_get_pages(bio, iter, - bdev_logical_block_size(bdev) - 1); -} - #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ -- cgit v1.2.3 From 8375b76517cb52bac0903071feedc218c45d74d2 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 21 Sep 2025 08:44:56 +0300 Subject: kho: replace kho_preserve_phys() with kho_preserve_pages() to make it clear that KHO operates on pages rather than on a random physical address. kho_preserve_pages() will also be used in upcoming support for vmalloc preservation.
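[ editorial note: a minimal conversion sketch. A caller that used to preserve a page-aligned physical range with kho_preserve_phys() converts it to a page pointer plus page count, as the reserve_mem conversion in this patch does: ]

	/* before */
	err = kho_preserve_phys(map->start, map->size);

	/* after: map->start and map->size are page-aligned */
	struct page *page = phys_to_page(map->start);
	unsigned int nr_pages = map->size >> PAGE_SHIFT;

	err = kho_preserve_pages(page, nr_pages);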
Link: https://lkml.kernel.org/r/20250921054458.4043761-3-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Reviewed-by: Jason Gunthorpe Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Cc: Chris Li Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 5 +++-- kernel/kexec_handover.c | 25 +++++++++++-------------- mm/memblock.c | 4 +++- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 559d13a3bc44..cec663b39861 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -18,6 +18,7 @@ enum kho_event { struct folio; struct notifier_block; +struct page; #define DECLARE_KHOSER_PTR(name, type) \ union { \ @@ -43,7 +44,7 @@ bool kho_is_enabled(void); bool is_kho_boot(void); int kho_preserve_folio(struct folio *folio); -int kho_preserve_phys(phys_addr_t phys, size_t size); +int kho_preserve_pages(struct page *page, unsigned int nr_pages); struct folio *kho_restore_folio(phys_addr_t phys); int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt); int kho_retrieve_subtree(const char *name, phys_addr_t *phys); @@ -71,7 +72,7 @@ static inline int kho_preserve_folio(struct folio *folio) return -EOPNOTSUPP; } -static inline int kho_preserve_phys(phys_addr_t phys, size_t size) +static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages) { return -EOPNOTSUPP; } diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index b8d0d63f6145..1c44a55f758e 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -725,26 +725,23 @@ int kho_preserve_folio(struct folio *folio) EXPORT_SYMBOL_GPL(kho_preserve_folio); /** - * kho_preserve_phys - preserve a physically contiguous range across kexec. - * @phys: physical address of the range. - * @size: size of the range. + * kho_preserve_pages - preserve contiguous pages across kexec + * @page: first page in the list. + * @nr_pages: number of pages. * - * Instructs KHO to preserve the memory range from @phys to @phys + @size - * across kexec. + * Preserve a contiguous list of order 0 pages. Must be restored using + * kho_restore_pages() to ensure the pages are restored properly as order 0. 
* * Return: 0 on success, error code on failure */ -int kho_preserve_phys(phys_addr_t phys, size_t size) +int kho_preserve_pages(struct page *page, unsigned int nr_pages) { - unsigned long pfn = PHYS_PFN(phys); + struct kho_mem_track *track = &kho_out.ser.track; + const unsigned long start_pfn = page_to_pfn(page); + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn = start_pfn; unsigned long failed_pfn = 0; - const unsigned long start_pfn = pfn; - const unsigned long end_pfn = PHYS_PFN(phys + size); int err = 0; - struct kho_mem_track *track = &kho_out.ser.track; - - if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) - return -EINVAL; while (pfn < end_pfn) { const unsigned int order = @@ -764,7 +761,7 @@ int kho_preserve_phys(phys_addr_t phys, size_t size) return err; } -EXPORT_SYMBOL_GPL(kho_preserve_phys); +EXPORT_SYMBOL_GPL(kho_preserve_pages); /* Handling for debug/kho/out */ diff --git a/mm/memblock.c b/mm/memblock.c index 120a501a887a..e23e16618e9b 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2452,8 +2452,10 @@ static int reserve_mem_kho_finalize(struct kho_serialization *ser) for (i = 0; i < reserved_mem_count; i++) { struct reserve_mem_table *map = &reserved_mem_table[i]; + struct page *page = phys_to_page(map->start); + unsigned int nr_pages = map->size >> PAGE_SHIFT; - err |= kho_preserve_phys(map->start, map->size); + err |= kho_preserve_pages(page, nr_pages); } err |= kho_preserve_folio(page_folio(kho_fdt)); -- cgit v1.2.3 From a667300bd53f272a3055238bcefe108f88836270 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 21 Sep 2025 08:44:57 +0300 Subject: kho: add support for preserving vmalloc allocations A vmalloc allocation is preserved using a binary structure similar to the global KHO memory tracker. It's a linked list of pages where each page is an array of physical addresses of pages in the vmalloc area. kho_preserve_vmalloc() hands out the physical address of the head page to the caller. This address is used as the argument to kho_restore_vmalloc() to restore the mapping in the vmalloc address space and populate it with the preserved pages.
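[ editorial note: a round-trip usage sketch, assuming the subsystem has its own way of carrying the struct kho_vmalloc descriptor across kexec (for example inside its KHO FDT subtree); that serialization step is elided here. ]

	/* outgoing kernel */
	struct kho_vmalloc preservation;
	void *buf = vmalloc(SZ_1M);
	int err;

	err = kho_preserve_vmalloc(buf, &preservation);
	if (err)
		return err;
	/* ... store 'preservation' in the subsystem's KHO subtree ... */

	/* next kernel, after kexec */
	void *restored = kho_restore_vmalloc(&preservation);
	if (!restored)
		return -ENOENT;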
[pasha.tatashin@soleen.com: free chunks using free_page() not kfree()] Link: https://lkml.kernel.org/r/mafs0a52idbeg.fsf@kernel.org [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20250921054458.4043761-4-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Cc: Chris Li Cc: Jason Gunthorpe Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 28 ++++ kernel/kexec_handover.c | 281 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 309 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index cec663b39861..25042c1d8d54 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -39,13 +39,24 @@ struct page; struct kho_serialization; +struct kho_vmalloc_chunk; +struct kho_vmalloc { + DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *); + unsigned int total_pages; + unsigned short flags; + unsigned short order; +}; + #ifdef CONFIG_KEXEC_HANDOVER bool kho_is_enabled(void); bool is_kho_boot(void); int kho_preserve_folio(struct folio *folio); int kho_preserve_pages(struct page *page, unsigned int nr_pages); +int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation); struct folio *kho_restore_folio(phys_addr_t phys); +struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages); +void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt); int kho_retrieve_subtree(const char *name, phys_addr_t *phys); @@ -77,11 +88,28 @@ static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages) return -EOPNOTSUPP; } +static inline int kho_preserve_vmalloc(void *ptr, + struct kho_vmalloc *preservation) +{ + return -EOPNOTSUPP; +} + static inline struct folio *kho_restore_folio(phys_addr_t phys) { return NULL; } +static inline struct page *kho_restore_pages(phys_addr_t phys, + unsigned int nr_pages) +{ + return NULL; +} + +static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) +{ + return NULL; +} + static inline int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt) { diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 1c44a55f758e..76f0940fb485 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -274,6 +275,37 @@ struct folio *kho_restore_folio(phys_addr_t phys) } EXPORT_SYMBOL_GPL(kho_restore_folio); +/** + * kho_restore_pages - restore list of contiguous order 0 pages. + * @phys: physical address of the first page. + * @nr_pages: number of pages. + * + * Restore a contiguous list of order 0 pages that was preserved with + * kho_preserve_pages(). 
+ * + * Return: pointer to the first struct page on success, NULL on failure + */ +struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages) +{ + const unsigned long start_pfn = PHYS_PFN(phys); + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn = start_pfn; + + while (pfn < end_pfn) { + const unsigned int order = + min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); + struct page *page = kho_restore_page(PFN_PHYS(pfn)); + + if (!page) + return NULL; + split_page(page, order); + pfn += 1 << order; + } + + return pfn_to_page(start_pfn); +} +EXPORT_SYMBOL_GPL(kho_restore_pages); + /* Serialize and deserialize struct kho_mem_phys across kexec * * Record all the bitmaps in a linked list of pages for the next kernel to @@ -763,6 +795,255 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages) } EXPORT_SYMBOL_GPL(kho_preserve_pages); +struct kho_vmalloc_hdr { + DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *); +}; + +#define KHO_VMALLOC_SIZE \ + ((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \ + sizeof(phys_addr_t)) + +struct kho_vmalloc_chunk { + struct kho_vmalloc_hdr hdr; + phys_addr_t phys[KHO_VMALLOC_SIZE]; +}; + +static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE); + +/* vmalloc flags KHO supports */ +#define KHO_VMALLOC_SUPPORTED_FLAGS (VM_ALLOC | VM_ALLOW_HUGE_VMAP) + +/* KHO internal flags for vmalloc preservations */ +#define KHO_VMALLOC_ALLOC 0x0001 +#define KHO_VMALLOC_HUGE_VMAP 0x0002 + +static unsigned short vmalloc_flags_to_kho(unsigned int vm_flags) +{ + unsigned short kho_flags = 0; + + if (vm_flags & VM_ALLOC) + kho_flags |= KHO_VMALLOC_ALLOC; + if (vm_flags & VM_ALLOW_HUGE_VMAP) + kho_flags |= KHO_VMALLOC_HUGE_VMAP; + + return kho_flags; +} + +static unsigned int kho_flags_to_vmalloc(unsigned short kho_flags) +{ + unsigned int vm_flags = 0; + + if (kho_flags & KHO_VMALLOC_ALLOC) + vm_flags |= VM_ALLOC; + if (kho_flags & KHO_VMALLOC_HUGE_VMAP) + vm_flags |= VM_ALLOW_HUGE_VMAP; + + return vm_flags; +} + +static struct kho_vmalloc_chunk *new_vmalloc_chunk(struct kho_vmalloc_chunk *cur) +{ + struct kho_vmalloc_chunk *chunk; + int err; + + chunk = (struct kho_vmalloc_chunk *)get_zeroed_page(GFP_KERNEL); + if (!chunk) + return NULL; + + err = kho_preserve_pages(virt_to_page(chunk), 1); + if (err) + goto err_free; + if (cur) + KHOSER_STORE_PTR(cur->hdr.next, chunk); + return chunk; + +err_free: + free_page((unsigned long)chunk); + return NULL; +} + +static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk) +{ + struct kho_mem_track *track = &kho_out.ser.track; + unsigned long pfn = PHYS_PFN(virt_to_phys(chunk)); + + __kho_unpreserve(track, pfn, pfn + 1); + + for (int i = 0; chunk->phys[i]; i++) { + pfn = PHYS_PFN(chunk->phys[i]); + __kho_unpreserve(track, pfn, pfn + 1); + } +} + +static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc) +{ + struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(kho_vmalloc->first); + + while (chunk) { + struct kho_vmalloc_chunk *tmp = chunk; + + kho_vmalloc_unpreserve_chunk(chunk); + + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + free_page((unsigned long)tmp); + } +} + +/** + * kho_preserve_vmalloc - preserve memory allocated with vmalloc() across kexec + * @ptr: pointer to the area in vmalloc address space + * @preservation: placeholder for preservation metadata + * + * Instructs KHO to preserve the area in vmalloc address space at @ptr.
The + * physical pages mapped at @ptr will be preserved and on successful return + * @preservation will hold the physical address of a structure that describes + * the preservation. + * + * NOTE: The memory allocated with vmalloc_node() variants cannot be reliably + * restored on the same node + * + * Return: 0 on success, error code on failure + */ +int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation) +{ + struct kho_vmalloc_chunk *chunk; + struct vm_struct *vm = find_vm_area(ptr); + unsigned int order, flags, nr_contig_pages; + unsigned int idx = 0; + int err; + + if (!vm) + return -EINVAL; + + if (vm->flags & ~KHO_VMALLOC_SUPPORTED_FLAGS) + return -EOPNOTSUPP; + + flags = vmalloc_flags_to_kho(vm->flags); + order = get_vm_area_page_order(vm); + + chunk = new_vmalloc_chunk(NULL); + if (!chunk) + return -ENOMEM; + KHOSER_STORE_PTR(preservation->first, chunk); + + nr_contig_pages = (1 << order); + for (int i = 0; i < vm->nr_pages; i += nr_contig_pages) { + phys_addr_t phys = page_to_phys(vm->pages[i]); + + err = kho_preserve_pages(vm->pages[i], nr_contig_pages); + if (err) + goto err_free; + + chunk->phys[idx++] = phys; + if (idx == ARRAY_SIZE(chunk->phys)) { + chunk = new_vmalloc_chunk(chunk); + if (!chunk) + goto err_free; + idx = 0; + } + } + + preservation->total_pages = vm->nr_pages; + preservation->flags = flags; + preservation->order = order; + + return 0; + +err_free: + kho_vmalloc_free_chunks(preservation); + return err; +} +EXPORT_SYMBOL_GPL(kho_preserve_vmalloc); + +/** + * kho_restore_vmalloc - recreates and populates an area in vmalloc address + * space from the preserved memory. + * @preservation: preservation metadata. + * + * Recreates an area in vmalloc address space and populates it with memory that + * was preserved using kho_preserve_vmalloc(). + * + * Return: pointer to the area in the vmalloc address space, NULL on failure. 
+ */ +void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) +{ + struct kho_vmalloc_chunk *chunk = KHOSER_LOAD_PTR(preservation->first); + unsigned int align, order, shift, vm_flags; + unsigned long total_pages, contig_pages; + unsigned long addr, size; + struct vm_struct *area; + struct page **pages; + unsigned int idx = 0; + int err; + + vm_flags = kho_flags_to_vmalloc(preservation->flags); + if (vm_flags & ~KHO_VMALLOC_SUPPORTED_FLAGS) + return NULL; + + total_pages = preservation->total_pages; + pages = kvmalloc_array(total_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return NULL; + order = preservation->order; + contig_pages = (1 << order); + shift = PAGE_SHIFT + order; + align = 1 << shift; + + while (chunk) { + struct page *page; + + for (int i = 0; chunk->phys[i]; i++) { + phys_addr_t phys = chunk->phys[i]; + + if (idx + contig_pages > total_pages) + goto err_free_pages_array; + + page = kho_restore_pages(phys, contig_pages); + if (!page) + goto err_free_pages_array; + + for (int j = 0; j < contig_pages; j++) + pages[idx++] = page; + + phys += contig_pages * PAGE_SIZE; + } + + page = kho_restore_pages(virt_to_phys(chunk), 1); + if (!page) + goto err_free_pages_array; + chunk = KHOSER_LOAD_PTR(chunk->hdr.next); + __free_page(page); + } + + if (idx != total_pages) + goto err_free_pages_array; + + area = __get_vm_area_node(total_pages * PAGE_SIZE, align, shift, + vm_flags, VMALLOC_START, VMALLOC_END, + NUMA_NO_NODE, GFP_KERNEL, + __builtin_return_address(0)); + if (!area) + goto err_free_pages_array; + + addr = (unsigned long)area->addr; + size = get_vm_area_size(area); + err = vmap_pages_range(addr, addr + size, PAGE_KERNEL, pages, shift); + if (err) + goto err_free_vm_area; + + area->nr_pages = total_pages; + area->pages = pages; + + return area->addr; + +err_free_vm_area: + free_vm_area(area); +err_free_pages_array: + kvfree(pages); + return NULL; +} +EXPORT_SYMBOL_GPL(kho_restore_vmalloc); + /* Handling for debug/kho/out */ static struct dentry *debugfs_root; -- cgit v1.2.3 From fcc0669c5aa681994c507b50f1c706c969d99730 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 22 Sep 2025 15:02:03 -0700 Subject: memcg: skip cgroup_file_notify if spinning is not allowed Generally memcg charging is allowed from all the contexts including NMI where even spinning on a spinlock can cause locking issues. However one call chain was missed during the addition of memcg charging from any context support. That is try_charge_memcg() -> memcg_memory_event() -> cgroup_file_notify(). The possible function call tree under cgroup_file_notify() can acquire many different spin locks in spinning mode. Some of them are cgroup_file_kn_lock, kernfs_notify_lock, pool_workqueue's lock. So, let's just skip cgroup_file_notify() from memcg charging if the context does not allow spinning. An alternative approach was also explored where instead of skipping cgroup_file_notify(), we defer the memcg event processing to irq_work [1]. However, it adds complexity and it was decided to keep things simple until we need more memcg events with !allow_spinning requirement.
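[ editorial note: the guard pattern in miniature. gfpflags_allow_spinning() already exists and is used by try_charge_memcg() in this patch; the point is that the event counters are still bumped in every context, while only the kernfs notification is skipped when spinning is not allowed. ]

	bool allow_spinning = gfpflags_allow_spinning(gfp_mask);

	atomic_long_inc(&memcg->memory_events[event]);	/* always safe */
	if (allow_spinning)
		cgroup_file_notify(&memcg->events_file);	/* may take spinlocks */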
Link: https://lore.kernel.org/all/5qi2llyzf7gklncflo6gxoozljbm4h3tpnuv4u4ej4ztysvi6f@x44v7nz2wdzd/ [1] Link: https://lkml.kernel.org/r/20250922220203.261714-1-shakeel.butt@linux.dev Fixes: 3ac4638a734a ("memcg: make memcg_rstat_updated nmi safe") Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Closes: https://lore.kernel.org/all/20250905061919.439648-1-yepeilin@google.com/ Cc: Alexei Starovoitov Cc: Johannes Weiner Cc: Kumar Kartikeya Dwivedi Cc: Muchun Song Cc: Peilin Ye Cc: Roman Gushchin Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 26 +++++++++++++++++++------- mm/memcontrol.c | 7 ++++--- 2 files changed, 23 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 16fe0306e50e..873e510d6f8d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1001,22 +1001,28 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, count_memcg_events_mm(mm, idx, 1); } -static inline void memcg_memory_event(struct mem_cgroup *memcg, - enum memcg_memory_event event) +static inline void __memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event, + bool allow_spinning) { bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || event == MEMCG_SWAP_FAIL; + /* For now only MEMCG_MAX can happen with !allow_spinning context. */ + VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX); + atomic_long_inc(&memcg->memory_events_local[event]); - if (!swap_event) + if (!swap_event && allow_spinning) cgroup_file_notify(&memcg->events_local_file); do { atomic_long_inc(&memcg->memory_events[event]); - if (swap_event) - cgroup_file_notify(&memcg->swap_events_file); - else - cgroup_file_notify(&memcg->events_file); + if (allow_spinning) { + if (swap_event) + cgroup_file_notify(&memcg->swap_events_file); + else + cgroup_file_notify(&memcg->events_file); + } if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) break; @@ -1026,6 +1032,12 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg, !mem_cgroup_is_root(memcg)); } +static inline void memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event) +{ + __memcg_memory_event(memcg, event, true); +} + static inline void memcg_memory_event_mm(struct mm_struct *mm, enum memcg_memory_event event) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e090f29eb03b..4deda33625f4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2307,12 +2307,13 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, bool drained = false; bool raised_max_event = false; unsigned long pflags; + bool allow_spinning = gfpflags_allow_spinning(gfp_mask); retry: if (consume_stock(memcg, nr_pages)) return 0; - if (!gfpflags_allow_spinning(gfp_mask)) + if (!allow_spinning) /* Avoid the refill and flush of the older stock */ batch = nr_pages; @@ -2348,7 +2349,7 @@ retry: if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; - memcg_memory_event(mem_over_limit, MEMCG_MAX); + __memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning); raised_max_event = true; psi_memstall_enter(&pflags); @@ -2415,7 +2416,7 @@ force: * a MEMCG_MAX event. 
*/ if (!raised_max_event) - memcg_memory_event(mem_over_limit, MEMCG_MAX); + __memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning); /* * The allocation either can't fail or will lead to more memory -- cgit v1.2.3 From f04aad36a07cc17b7a5d5b9a2d386ce6fae63e93 Mon Sep 17 00:00:00 2001 From: Jakub Acs Date: Wed, 1 Oct 2025 09:03:52 +0000 Subject: mm/ksm: fix flag-dropping behavior in ksm_madvise syzkaller discovered the following crash: (kernel BUG) [ 44.607039] ------------[ cut here ]------------ [ 44.607422] kernel BUG at mm/userfaultfd.c:2067! [ 44.608148] Oops: invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN NOPTI [ 44.608814] CPU: 1 UID: 0 PID: 2475 Comm: reproducer Not tainted 6.16.0-rc6 #1 PREEMPT(none) [ 44.609635] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 44.610695] RIP: 0010:userfaultfd_release_all+0x3a8/0x460 [ 44.617726] Call Trace: [ 44.617926] [ 44.619284] userfaultfd_release+0xef/0x1b0 [ 44.620976] __fput+0x3f9/0xb60 [ 44.621240] fput_close_sync+0x110/0x210 [ 44.622222] __x64_sys_close+0x8f/0x120 [ 44.622530] do_syscall_64+0x5b/0x2f0 [ 44.622840] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 44.623244] RIP: 0033:0x7f365bb3f227 The kernel panics because it detects UFFD inconsistency during userfaultfd_release_all(). Specifically, a VMA which has a valid pointer to vma->vm_userfaultfd_ctx, but no UFFD flags in vma->vm_flags. The inconsistency is caused in ksm_madvise(): when a user calls madvise() with MADV_UNMERGEABLE on a VMA that is registered for UFFD in MINOR mode, it accidentally clears all flags stored in the upper 32 bits of vma->vm_flags. Assuming an x86_64 kernel build, unsigned long is 64-bit and unsigned int and int are 32-bit wide. This setup causes the following mishap during the &= ~VM_MERGEABLE assignment. VM_MERGEABLE is a 32-bit constant of type unsigned int, 0x8000'0000. After ~ is applied, it becomes 0x7fff'ffff unsigned int, which is then promoted to unsigned long before the & operation. This promotion fills the upper 32 bits with leading 0s, as we're doing an unsigned conversion (and even for a signed conversion, this wouldn't help as the leading bit is 0). The & operation thus ends up AND-ing vm_flags with 0x0000'0000'7fff'ffff instead of the intended 0xffff'ffff'7fff'ffff and hence accidentally clears the upper 32 bits of its value. Fix it by changing the `VM_MERGEABLE` constant to unsigned long, using the BIT() macro. Note: other VM_* flags are not affected: This only happens to the VM_MERGEABLE flag, as the other VM_* flags are all constants of type int and after the ~ operation, they end up with a leading 1 and are thus converted to unsigned long with leading 1s. Note 2: After commit 31defc3b01d9 ("userfaultfd: remove (VM_)BUG_ON()s"), this is no longer a kernel BUG, but a WARNING at the same place: [ 45.595973] WARNING: CPU: 1 PID: 2474 at mm/userfaultfd.c:2067 but the root-cause (flag-drop) remains the same.
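[ editorial note: the promotion bug in miniature, as a standalone userspace program rather than kernel code. ]

	#include <stdio.h>

	int main(void)
	{
		unsigned long vm_flags = 0xffffffff00000001UL;
		unsigned int old_merge = 0x80000000;	/* old VM_MERGEABLE */

		/*
		 * ~old_merge is 0x7fffffff (unsigned int); it is zero-extended
		 * to 0x000000007fffffff before the &, wiping bits 32..63.
		 */
		unsigned long broken = vm_flags & ~old_merge;

		/* BIT(31) is 1UL << 31, so ~ keeps the upper bits set */
		unsigned long fixed = vm_flags & ~(1UL << 31);

		printf("%lx\n", broken);	/* prints 1: upper flags lost */
		printf("%lx\n", fixed);		/* prints ffffffff00000001 */
		return 0;
	}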
[akpm@linux-foundation.org: rust bindgen wasn't able to handle BIT(), from Miguel] Link: https://lore.kernel.org/oe-kbuild-all/202510030449.VfSaAjvd-lkp@intel.com/ Link: https://lkml.kernel.org/r/20251001090353.57523-2-acsjakub@amazon.de Fixes: 7677f7fd8be7 ("userfaultfd: add minor fault registration mode") Signed-off-by: Jakub Acs Signed-off-by: Miguel Ojeda Acked-by: David Hildenbrand Acked-by: SeongJae Park Tested-by: Alice Ryhl Tested-by: Miguel Ojeda Cc: Xu Xin Cc: Chengming Zhou Cc: Peter Xu Cc: Axel Rasmussen Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- rust/bindings/bindings_helper.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 06978b4dbeb8..70a2a76007d4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -323,7 +323,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ -#define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ +#define VM_MERGEABLE BIT(31) /* KSM may merge identical pages */ #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS #define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 04b75d4d01c3..2e43c66635a2 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -108,6 +108,7 @@ const xa_mark_t RUST_CONST_HELPER_XA_PRESENT = XA_PRESENT; const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC = XA_FLAGS_ALLOC; const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC1 = XA_FLAGS_ALLOC1; +const vm_flags_t RUST_CONST_HELPER_VM_MERGEABLE = VM_MERGEABLE; #if IS_ENABLED(CONFIG_ANDROID_BINDER_IPC_RUST) #include "../../drivers/android/binder/rust_binder.h" -- cgit v1.2.3 From 27c0a7b05d13a0dc54ed0b95fc12218210fdea1a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 31 Jul 2025 12:02:27 -0700 Subject: libceph: Use HMAC-SHA256 library instead of crypto_shash Use the HMAC-SHA256 library functions instead of crypto_shash. This is simpler and faster. 
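[ editorial note: the library calling convention in miniature, using only functions that appear in this patch (hmac_sha256_preparekey/init/update/final); session_key, hdr and payload stand for the caller's buffers. Unlike the crypto_shash API, none of these calls can fail, which is why the error paths disappear from messenger_v2.c below. ]

	struct hmac_sha256_key key;
	struct hmac_sha256_ctx ctx;
	u8 mac[SHA256_DIGEST_SIZE];

	hmac_sha256_preparekey(&key, session_key, session_key_len);

	hmac_sha256_init(&ctx, &key);
	hmac_sha256_update(&ctx, hdr, hdr_len);
	hmac_sha256_update(&ctx, payload, payload_len);
	hmac_sha256_final(&ctx, mac);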
Signed-off-by: Eric Biggers Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 4 ++- net/ceph/Kconfig | 3 +- net/ceph/messenger_v2.c | 77 ++++++++++++------------------------------ 3 files changed, 26 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 1717cc57cdac..4b49592a738f 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -2,6 +2,7 @@ #ifndef __FS_CEPH_MESSENGER_H #define __FS_CEPH_MESSENGER_H +#include #include #include #include @@ -412,7 +413,8 @@ struct ceph_connection_v2_info { struct ceph_msg_data_cursor in_cursor; struct ceph_msg_data_cursor out_cursor; - struct crypto_shash *hmac_tfm; /* post-auth signature */ + struct hmac_sha256_key hmac_key; /* post-auth signature */ + bool hmac_key_set; struct crypto_aead *gcm_tfm; /* on-wire encryption */ struct aead_request *gcm_req; struct crypto_wait gcm_wait; diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index 0aa21fcbf6ec..ea60e3ef0834 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -6,8 +6,7 @@ config CEPH_LIB select CRYPTO_AES select CRYPTO_CBC select CRYPTO_GCM - select CRYPTO_HMAC - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA256 select CRYPTO select KEYS default n diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index 5483b4eed94e..c54c8b5a6526 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -709,7 +709,7 @@ static int setup_crypto(struct ceph_connection *con, dout("%s con %p con_mode %d session_key_len %d con_secret_len %d\n", __func__, con, con->v2.con_mode, session_key_len, con_secret_len); - WARN_ON(con->v2.hmac_tfm || con->v2.gcm_tfm || con->v2.gcm_req); + WARN_ON(con->v2.hmac_key_set || con->v2.gcm_tfm || con->v2.gcm_req); if (con->v2.con_mode != CEPH_CON_MODE_CRC && con->v2.con_mode != CEPH_CON_MODE_SECURE) { @@ -723,22 +723,8 @@ static int setup_crypto(struct ceph_connection *con, return 0; /* auth_none */ } - noio_flag = memalloc_noio_save(); - con->v2.hmac_tfm = crypto_alloc_shash("hmac(sha256)", 0, 0); - memalloc_noio_restore(noio_flag); - if (IS_ERR(con->v2.hmac_tfm)) { - ret = PTR_ERR(con->v2.hmac_tfm); - con->v2.hmac_tfm = NULL; - pr_err("failed to allocate hmac tfm context: %d\n", ret); - return ret; - } - - ret = crypto_shash_setkey(con->v2.hmac_tfm, session_key, - session_key_len); - if (ret) { - pr_err("failed to set hmac key: %d\n", ret); - return ret; - } + hmac_sha256_preparekey(&con->v2.hmac_key, session_key, session_key_len); + con->v2.hmac_key_set = true; if (con->v2.con_mode == CEPH_CON_MODE_CRC) { WARN_ON(con_secret_len); @@ -793,38 +779,26 @@ static int setup_crypto(struct ceph_connection *con, return 0; /* auth_x, secure mode */ } -static int ceph_hmac_sha256(struct ceph_connection *con, - const struct kvec *kvecs, int kvec_cnt, u8 *hmac) +static void ceph_hmac_sha256(struct ceph_connection *con, + const struct kvec *kvecs, int kvec_cnt, + u8 hmac[SHA256_DIGEST_SIZE]) { - SHASH_DESC_ON_STACK(desc, con->v2.hmac_tfm); /* tfm arg is ignored */ - int ret; + struct hmac_sha256_ctx ctx; int i; - dout("%s con %p hmac_tfm %p kvec_cnt %d\n", __func__, con, - con->v2.hmac_tfm, kvec_cnt); + dout("%s con %p hmac_key_set %d kvec_cnt %d\n", __func__, con, + con->v2.hmac_key_set, kvec_cnt); - if (!con->v2.hmac_tfm) { + if (!con->v2.hmac_key_set) { memset(hmac, 0, SHA256_DIGEST_SIZE); - return 0; /* auth_none */ + return; /* auth_none */ } - desc->tfm = con->v2.hmac_tfm; - ret = crypto_shash_init(desc); 
- if (ret) - goto out; - - for (i = 0; i < kvec_cnt; i++) { - ret = crypto_shash_update(desc, kvecs[i].iov_base, - kvecs[i].iov_len); - if (ret) - goto out; - } - - ret = crypto_shash_final(desc, hmac); - -out: - shash_desc_zero(desc); - return ret; /* auth_x, both plain and secure modes */ + /* auth_x, both plain and secure modes */ + hmac_sha256_init(&ctx, &con->v2.hmac_key); + for (i = 0; i < kvec_cnt; i++) + hmac_sha256_update(&ctx, kvecs[i].iov_base, kvecs[i].iov_len); + hmac_sha256_final(&ctx, hmac); } static void gcm_inc_nonce(struct ceph_gcm_nonce *nonce) @@ -1455,17 +1429,14 @@ static int prepare_auth_request_more(struct ceph_connection *con, static int prepare_auth_signature(struct ceph_connection *con) { void *buf; - int ret; buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE, con_secure(con))); if (!buf) return -ENOMEM; - ret = ceph_hmac_sha256(con, con->v2.in_sign_kvecs, - con->v2.in_sign_kvec_cnt, CTRL_BODY(buf)); - if (ret) - return ret; + ceph_hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt, + CTRL_BODY(buf)); return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf, SHA256_DIGEST_SIZE); @@ -2460,10 +2431,8 @@ static int process_auth_signature(struct ceph_connection *con, return -EINVAL; } - ret = ceph_hmac_sha256(con, con->v2.out_sign_kvecs, - con->v2.out_sign_kvec_cnt, hmac); - if (ret) - return ret; + ceph_hmac_sha256(con, con->v2.out_sign_kvecs, con->v2.out_sign_kvec_cnt, + hmac); ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad); if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) { @@ -3814,10 +3783,8 @@ void ceph_con_v2_reset_protocol(struct ceph_connection *con) memzero_explicit(&con->v2.in_gcm_nonce, CEPH_GCM_IV_LEN); memzero_explicit(&con->v2.out_gcm_nonce, CEPH_GCM_IV_LEN); - if (con->v2.hmac_tfm) { - crypto_free_shash(con->v2.hmac_tfm); - con->v2.hmac_tfm = NULL; - } + memzero_explicit(&con->v2.hmac_key, sizeof(con->v2.hmac_key)); + con->v2.hmac_key_set = false; if (con->v2.gcm_req) { aead_request_free(con->v2.gcm_req); con->v2.gcm_req = NULL; -- cgit v1.2.3 From 59699a5a7114f09f890e86c09a6b32afb5eaa64c Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Wed, 6 Aug 2025 11:48:53 +0200 Subject: libceph: make ceph_con_get_out_msg() return the message pointer The caller in messenger_v1.c loads it anyway, so let's keep the pointer in a register instead of reloading it from memory. This eliminates a tiny bit of unnecessary overhead.
Signed-off-by: Max Kellermann Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 2 +- net/ceph/messenger.c | 4 ++-- net/ceph/messenger_v1.c | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 4b49592a738f..9ebcac2981fd 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -550,7 +550,7 @@ void ceph_addr_set_port(struct ceph_entity_addr *addr, int p); void ceph_con_process_message(struct ceph_connection *con); int ceph_con_in_msg_alloc(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip); -void ceph_con_get_out_msg(struct ceph_connection *con); +struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con); /* messenger_v1.c */ int ceph_con_v1_try_read(struct ceph_connection *con); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9f6d860411cb..b6c7bfc03503 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2110,7 +2110,7 @@ int ceph_con_in_msg_alloc(struct ceph_connection *con, return ret; } -void ceph_con_get_out_msg(struct ceph_connection *con) +struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con) { struct ceph_msg *msg; @@ -2141,7 +2141,7 @@ void ceph_con_get_out_msg(struct ceph_connection *con) * message or in case of a fault. */ WARN_ON(con->out_msg); - con->out_msg = ceph_msg_get(msg); + return con->out_msg = ceph_msg_get(msg); } /* diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 0cb61c76b9b8..eebe4e19d75a 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -210,8 +210,7 @@ static void prepare_write_message(struct ceph_connection *con) &con->v1.out_temp_ack); } - ceph_con_get_out_msg(con); - m = con->out_msg; + m = ceph_con_get_out_msg(con); dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", m, con->out_seq, le16_to_cpu(m->hdr.type), -- cgit v1.2.3 From 7399212dcf64d90a6ab239bdd98bd325d922fc7e Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Wed, 6 Aug 2025 11:48:54 +0200 Subject: libceph: pass the message pointer instead of loading con->out_msg This pointer is in a register anyway, so let's use that instead of reloading from memory everywhere. 
[ idryomov: formatting ] Signed-off-by: Max Kellermann Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 4 +- net/ceph/messenger.c | 4 +- net/ceph/messenger_v1.c | 45 ++++++----- net/ceph/messenger_v2.c | 168 +++++++++++++++++++++-------------------- 4 files changed, 114 insertions(+), 107 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 9ebcac2981fd..6aa4c6478c9f 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -555,7 +555,7 @@ struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con); /* messenger_v1.c */ int ceph_con_v1_try_read(struct ceph_connection *con); int ceph_con_v1_try_write(struct ceph_connection *con); -void ceph_con_v1_revoke(struct ceph_connection *con); +void ceph_con_v1_revoke(struct ceph_connection *con, struct ceph_msg *msg); void ceph_con_v1_revoke_incoming(struct ceph_connection *con); bool ceph_con_v1_opened(struct ceph_connection *con); void ceph_con_v1_reset_session(struct ceph_connection *con); @@ -564,7 +564,7 @@ void ceph_con_v1_reset_protocol(struct ceph_connection *con); /* messenger_v2.c */ int ceph_con_v2_try_read(struct ceph_connection *con); int ceph_con_v2_try_write(struct ceph_connection *con); -void ceph_con_v2_revoke(struct ceph_connection *con); +void ceph_con_v2_revoke(struct ceph_connection *con, struct ceph_msg *msg); void ceph_con_v2_revoke_incoming(struct ceph_connection *con); bool ceph_con_v2_opened(struct ceph_connection *con); void ceph_con_v2_reset_session(struct ceph_connection *con); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b6c7bfc03503..08a6a083609f 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1793,9 +1793,9 @@ void ceph_msg_revoke(struct ceph_msg *msg) WARN_ON(con->state != CEPH_CON_S_OPEN); dout("%s con %p msg %p was sending\n", __func__, con, msg); if (ceph_msgr2(from_msgr(con->msgr))) - ceph_con_v2_revoke(con); + ceph_con_v2_revoke(con, msg); else - ceph_con_v1_revoke(con); + ceph_con_v1_revoke(con, msg); ceph_msg_put(con->out_msg); con->out_msg = NULL; } else { diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index eebe4e19d75a..cc4a36ef8462 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -169,10 +169,9 @@ static void prepare_message_data(struct ceph_msg *msg, u32 data_len) * Prepare footer for currently outgoing message, and finish things * off. Assumes out_kvec* are already valid.. we just add on to the end. 
*/ -static void prepare_write_message_footer(struct ceph_connection *con) +static void prepare_write_message_footer(struct ceph_connection *con, + struct ceph_msg *m) { - struct ceph_msg *m = con->out_msg; - m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; dout("prepare_write_message_footer %p\n", con); @@ -230,31 +229,31 @@ static void prepare_write_message(struct ceph_connection *con) /* fill in hdr crc and finalize hdr */ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); - con->out_msg->hdr.crc = cpu_to_le32(crc); - memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr)); + m->hdr.crc = cpu_to_le32(crc); + memcpy(&con->v1.out_hdr, &m->hdr, sizeof(con->v1.out_hdr)); /* fill in front and middle crc, footer */ crc = crc32c(0, m->front.iov_base, m->front.iov_len); - con->out_msg->footer.front_crc = cpu_to_le32(crc); + m->footer.front_crc = cpu_to_le32(crc); if (m->middle) { crc = crc32c(0, m->middle->vec.iov_base, m->middle->vec.iov_len); - con->out_msg->footer.middle_crc = cpu_to_le32(crc); + m->footer.middle_crc = cpu_to_le32(crc); } else - con->out_msg->footer.middle_crc = 0; + m->footer.middle_crc = 0; dout("%s front_crc %u middle_crc %u\n", __func__, - le32_to_cpu(con->out_msg->footer.front_crc), - le32_to_cpu(con->out_msg->footer.middle_crc)); - con->out_msg->footer.flags = 0; + le32_to_cpu(m->footer.front_crc), + le32_to_cpu(m->footer.middle_crc)); + m->footer.flags = 0; /* is there a data payload? */ - con->out_msg->footer.data_crc = 0; + m->footer.data_crc = 0; if (m->data_length) { - prepare_message_data(con->out_msg, m->data_length); + prepare_message_data(m, m->data_length); con->v1.out_more = 1; /* data + footer will follow */ } else { /* no, queue up footer too and be done */ - prepare_write_message_footer(con); + prepare_write_message_footer(con, m); } ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); @@ -461,9 +460,9 @@ out: * 0 -> socket full, but more to do * <0 -> error */ -static int write_partial_message_data(struct ceph_connection *con) +static int write_partial_message_data(struct ceph_connection *con, + struct ceph_msg *msg) { - struct ceph_msg *msg = con->out_msg; struct ceph_msg_data_cursor *cursor = &msg->cursor; bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); u32 crc; @@ -515,7 +514,7 @@ static int write_partial_message_data(struct ceph_connection *con) else msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; con_out_kvec_reset(con); - prepare_write_message_footer(con); + prepare_write_message_footer(con, msg); return 1; /* must return > 0 to indicate success */ } @@ -1471,6 +1470,7 @@ bad_tag: */ int ceph_con_v1_try_write(struct ceph_connection *con) { + struct ceph_msg *msg; int ret = 1; dout("try_write start %p state %d\n", con, con->state); @@ -1517,14 +1517,15 @@ more: } /* msg pages? */ - if (con->out_msg) { + msg = con->out_msg; + if (msg) { if (con->v1.out_msg_done) { - ceph_msg_put(con->out_msg); + ceph_msg_put(msg); con->out_msg = NULL; /* we're done with this one */ goto do_next; } - ret = write_partial_message_data(con); + ret = write_partial_message_data(con, msg); if (ret == 1) goto more; /* we need to send the footer, too! 
*/ if (ret == 0) @@ -1563,10 +1564,8 @@ out: return ret; } -void ceph_con_v1_revoke(struct ceph_connection *con) +void ceph_con_v1_revoke(struct ceph_connection *con, struct ceph_msg *msg) { - struct ceph_msg *msg = con->out_msg; - WARN_ON(con->v1.out_skip); /* footer */ if (con->v1.out_msg_done) { diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index c54c8b5a6526..b44e936f3865 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -1560,10 +1560,11 @@ static int prepare_ack(struct ceph_connection *con) return prepare_control(con, FRAME_TAG_ACK, con->v2.out_buf, 8); } -static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted) +static void prepare_epilogue_plain(struct ceph_connection *con, + struct ceph_msg *msg, bool aborted) { dout("%s con %p msg %p aborted %d crcs %u %u %u\n", __func__, con, - con->out_msg, aborted, con->v2.out_epil.front_crc, + msg, aborted, con->v2.out_epil.front_crc, con->v2.out_epil.middle_crc, con->v2.out_epil.data_crc); encode_epilogue_plain(con, aborted); @@ -1574,10 +1575,9 @@ static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted) * For "used" empty segments, crc is -1. For unused (trailing) * segments, crc is 0. */ -static void prepare_message_plain(struct ceph_connection *con) +static void prepare_message_plain(struct ceph_connection *con, + struct ceph_msg *msg) { - struct ceph_msg *msg = con->out_msg; - prepare_head_plain(con, con->v2.out_buf, sizeof(struct ceph_msg_header2), NULL, 0, false); @@ -1618,7 +1618,7 @@ static void prepare_message_plain(struct ceph_connection *con) con->v2.out_state = OUT_S_QUEUE_DATA; } else { con->v2.out_epil.data_crc = 0; - prepare_epilogue_plain(con, false); + prepare_epilogue_plain(con, msg, false); con->v2.out_state = OUT_S_FINISH_MESSAGE; } } @@ -1630,7 +1630,8 @@ static void prepare_message_plain(struct ceph_connection *con) * allocate pages for the entire tail of the message (currently up * to ~32M) and two sgs arrays (up to ~256K each)... 
*/ -static int prepare_message_secure(struct ceph_connection *con) +static int prepare_message_secure(struct ceph_connection *con, + struct ceph_msg *msg) { void *zerop = page_address(ceph_zero_page); struct sg_table enc_sgt = {}; @@ -1645,7 +1646,7 @@ static int prepare_message_secure(struct ceph_connection *con) if (ret) return ret; - tail_len = tail_onwire_len(con->out_msg, true); + tail_len = tail_onwire_len(msg, true); if (!tail_len) { /* * Empty message: once the head is written, @@ -1656,7 +1657,7 @@ static int prepare_message_secure(struct ceph_connection *con) } encode_epilogue_secure(con, false); - ret = setup_message_sgs(&sgt, con->out_msg, zerop, zerop, zerop, + ret = setup_message_sgs(&sgt, msg, zerop, zerop, zerop, &con->v2.out_epil, NULL, 0, false); if (ret) goto out; @@ -1685,7 +1686,7 @@ static int prepare_message_secure(struct ceph_connection *con) goto out; dout("%s con %p msg %p sg_cnt %d enc_page_cnt %d\n", __func__, con, - con->out_msg, sgt.orig_nents, enc_page_cnt); + msg, sgt.orig_nents, enc_page_cnt); con->v2.out_state = OUT_S_QUEUE_ENC_PAGE; out: @@ -1694,19 +1695,19 @@ out: return ret; } -static int prepare_message(struct ceph_connection *con) +static int prepare_message(struct ceph_connection *con, struct ceph_msg *msg) { int lens[] = { sizeof(struct ceph_msg_header2), - front_len(con->out_msg), - middle_len(con->out_msg), - data_len(con->out_msg) + front_len(msg), + middle_len(msg), + data_len(msg) }; struct ceph_frame_desc desc; int ret; dout("%s con %p msg %p logical %d+%d+%d+%d\n", __func__, con, - con->out_msg, lens[0], lens[1], lens[2], lens[3]); + msg, lens[0], lens[1], lens[2], lens[3]); if (con->in_seq > con->in_seq_acked) { dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con, @@ -1717,15 +1718,15 @@ static int prepare_message(struct ceph_connection *con) reset_out_kvecs(con); init_frame_desc(&desc, FRAME_TAG_MESSAGE, lens, 4); encode_preamble(&desc, con->v2.out_buf); - fill_header2(CTRL_BODY(con->v2.out_buf), &con->out_msg->hdr, + fill_header2(CTRL_BODY(con->v2.out_buf), &msg->hdr, con->in_seq_acked); if (con_secure(con)) { - ret = prepare_message_secure(con); + ret = prepare_message_secure(con, msg); if (ret) return ret; } else { - prepare_message_plain(con); + prepare_message_plain(con, msg); } ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); @@ -3153,20 +3154,20 @@ int ceph_con_v2_try_read(struct ceph_connection *con) } } -static void queue_data(struct ceph_connection *con) +static void queue_data(struct ceph_connection *con, struct ceph_msg *msg) { struct bio_vec bv; con->v2.out_epil.data_crc = -1; - ceph_msg_data_cursor_init(&con->v2.out_cursor, con->out_msg, - data_len(con->out_msg)); + ceph_msg_data_cursor_init(&con->v2.out_cursor, msg, + data_len(msg)); get_bvec_at(&con->v2.out_cursor, &bv); set_out_bvec(con, &bv, true); con->v2.out_state = OUT_S_QUEUE_DATA_CONT; } -static void queue_data_cont(struct ceph_connection *con) +static void queue_data_cont(struct ceph_connection *con, struct ceph_msg *msg) { struct bio_vec bv; @@ -3187,7 +3188,7 @@ static void queue_data_cont(struct ceph_connection *con) * we are done. 
*/ reset_out_kvecs(con); - prepare_epilogue_plain(con, false); + prepare_epilogue_plain(con, msg, false); con->v2.out_state = OUT_S_FINISH_MESSAGE; } @@ -3219,7 +3220,7 @@ static void queue_enc_page(struct ceph_connection *con) con->v2.out_state = OUT_S_FINISH_MESSAGE; } -static void queue_zeros(struct ceph_connection *con) +static void queue_zeros(struct ceph_connection *con, struct ceph_msg *msg) { dout("%s con %p out_zero %d\n", __func__, con, con->v2.out_zero); @@ -3236,7 +3237,7 @@ static void queue_zeros(struct ceph_connection *con) * Once it's written, we are done patching up for the revoke. */ reset_out_kvecs(con); - prepare_epilogue_plain(con, true); + prepare_epilogue_plain(con, msg, true); con->v2.out_state = OUT_S_FINISH_MESSAGE; } @@ -3263,6 +3264,7 @@ static void finish_message(struct ceph_connection *con) static int populate_out_iter(struct ceph_connection *con) { + struct ceph_msg *msg; int ret; dout("%s con %p state %d out_state %d\n", __func__, con, con->state, @@ -3278,18 +3280,18 @@ static int populate_out_iter(struct ceph_connection *con) switch (con->v2.out_state) { case OUT_S_QUEUE_DATA: WARN_ON(!con->out_msg); - queue_data(con); + queue_data(con, con->out_msg); goto populated; case OUT_S_QUEUE_DATA_CONT: WARN_ON(!con->out_msg); - queue_data_cont(con); + queue_data_cont(con, con->out_msg); goto populated; case OUT_S_QUEUE_ENC_PAGE: queue_enc_page(con); goto populated; case OUT_S_QUEUE_ZEROS: WARN_ON(con->out_msg); /* revoked */ - queue_zeros(con); + queue_zeros(con, con->out_msg); goto populated; case OUT_S_FINISH_MESSAGE: finish_message(con); @@ -3309,8 +3311,8 @@ static int populate_out_iter(struct ceph_connection *con) return ret; } } else if (!list_empty(&con->out_queue)) { - ceph_con_get_out_msg(con); - ret = prepare_message(con); + msg = ceph_con_get_out_msg(con); + ret = prepare_message(con, msg); if (ret) { pr_err("prepare_message failed: %d\n", ret); return ret; @@ -3422,17 +3424,18 @@ static u32 crc32c_zeros(u32 crc, int zero_len) return crc; } -static void prepare_zero_front(struct ceph_connection *con, int resid) +static void prepare_zero_front(struct ceph_connection *con, + struct ceph_msg *msg, int resid) { int sent; - WARN_ON(!resid || resid > front_len(con->out_msg)); - sent = front_len(con->out_msg) - resid; + WARN_ON(!resid || resid > front_len(msg)); + sent = front_len(msg) - resid; dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); if (sent) { con->v2.out_epil.front_crc = - crc32c(-1, con->out_msg->front.iov_base, sent); + crc32c(-1, msg->front.iov_base, sent); con->v2.out_epil.front_crc = crc32c_zeros(con->v2.out_epil.front_crc, resid); } else { @@ -3443,17 +3446,18 @@ static void prepare_zero_front(struct ceph_connection *con, int resid) out_zero_add(con, resid); } -static void prepare_zero_middle(struct ceph_connection *con, int resid) +static void prepare_zero_middle(struct ceph_connection *con, + struct ceph_msg *msg, int resid) { int sent; - WARN_ON(!resid || resid > middle_len(con->out_msg)); - sent = middle_len(con->out_msg) - resid; + WARN_ON(!resid || resid > middle_len(msg)); + sent = middle_len(msg) - resid; dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); if (sent) { con->v2.out_epil.middle_crc = - crc32c(-1, con->out_msg->middle->vec.iov_base, sent); + crc32c(-1, msg->middle->vec.iov_base, sent); con->v2.out_epil.middle_crc = crc32c_zeros(con->v2.out_epil.middle_crc, resid); } else { @@ -3464,61 +3468,64 @@ static void prepare_zero_middle(struct ceph_connection *con, int resid) out_zero_add(con, 
resid); } -static void prepare_zero_data(struct ceph_connection *con) +static void prepare_zero_data(struct ceph_connection *con, + struct ceph_msg *msg) { dout("%s con %p\n", __func__, con); - con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(con->out_msg)); - out_zero_add(con, data_len(con->out_msg)); + con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(msg)); + out_zero_add(con, data_len(msg)); } -static void revoke_at_queue_data(struct ceph_connection *con) +static void revoke_at_queue_data(struct ceph_connection *con, + struct ceph_msg *msg) { int boundary; int resid; - WARN_ON(!data_len(con->out_msg)); + WARN_ON(!data_len(msg)); WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); resid = iov_iter_count(&con->v2.out_iter); - boundary = front_len(con->out_msg) + middle_len(con->out_msg); + boundary = front_len(msg) + middle_len(msg); if (resid > boundary) { resid -= boundary; WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); dout("%s con %p was sending head\n", __func__, con); - if (front_len(con->out_msg)) - prepare_zero_front(con, front_len(con->out_msg)); - if (middle_len(con->out_msg)) - prepare_zero_middle(con, middle_len(con->out_msg)); - prepare_zero_data(con); + if (front_len(msg)) + prepare_zero_front(con, msg, front_len(msg)); + if (middle_len(msg)) + prepare_zero_middle(con, msg, middle_len(msg)); + prepare_zero_data(con, msg); WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); con->v2.out_state = OUT_S_QUEUE_ZEROS; return; } - boundary = middle_len(con->out_msg); + boundary = middle_len(msg); if (resid > boundary) { resid -= boundary; dout("%s con %p was sending front\n", __func__, con); - prepare_zero_front(con, resid); - if (middle_len(con->out_msg)) - prepare_zero_middle(con, middle_len(con->out_msg)); - prepare_zero_data(con); - queue_zeros(con); + prepare_zero_front(con, msg, resid); + if (middle_len(msg)) + prepare_zero_middle(con, msg, middle_len(msg)); + prepare_zero_data(con, msg); + queue_zeros(con, msg); return; } WARN_ON(!resid); dout("%s con %p was sending middle\n", __func__, con); - prepare_zero_middle(con, resid); - prepare_zero_data(con); - queue_zeros(con); + prepare_zero_middle(con, msg, resid); + prepare_zero_data(con, msg); + queue_zeros(con, msg); } -static void revoke_at_queue_data_cont(struct ceph_connection *con) +static void revoke_at_queue_data_cont(struct ceph_connection *con, + struct ceph_msg *msg) { int sent, resid; /* current piece of data */ - WARN_ON(!data_len(con->out_msg)); + WARN_ON(!data_len(msg)); WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)); resid = iov_iter_count(&con->v2.out_iter); WARN_ON(!resid || resid > con->v2.out_bvec.bv_len); @@ -3537,10 +3544,11 @@ static void revoke_at_queue_data_cont(struct ceph_connection *con) con->v2.out_iter.count -= resid; out_zero_add(con, con->v2.out_cursor.total_resid); - queue_zeros(con); + queue_zeros(con, msg); } -static void revoke_at_finish_message(struct ceph_connection *con) +static void revoke_at_finish_message(struct ceph_connection *con, + struct ceph_msg *msg) { int boundary; int resid; @@ -3548,39 +3556,39 @@ static void revoke_at_finish_message(struct ceph_connection *con) WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); resid = iov_iter_count(&con->v2.out_iter); - if (!front_len(con->out_msg) && !middle_len(con->out_msg) && - !data_len(con->out_msg)) { + if (!front_len(msg) && !middle_len(msg) && + !data_len(msg)) { WARN_ON(!resid || resid > MESSAGE_HEAD_PLAIN_LEN); dout("%s con %p was sending head (empty message) - noop\n", __func__, con); return; } - boundary = front_len(con->out_msg) + 
middle_len(con->out_msg) + + boundary = front_len(msg) + middle_len(msg) + CEPH_EPILOGUE_PLAIN_LEN; if (resid > boundary) { resid -= boundary; WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); dout("%s con %p was sending head\n", __func__, con); - if (front_len(con->out_msg)) - prepare_zero_front(con, front_len(con->out_msg)); - if (middle_len(con->out_msg)) - prepare_zero_middle(con, middle_len(con->out_msg)); + if (front_len(msg)) + prepare_zero_front(con, msg, front_len(msg)); + if (middle_len(msg)) + prepare_zero_middle(con, msg, middle_len(msg)); con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); con->v2.out_state = OUT_S_QUEUE_ZEROS; return; } - boundary = middle_len(con->out_msg) + CEPH_EPILOGUE_PLAIN_LEN; + boundary = middle_len(msg) + CEPH_EPILOGUE_PLAIN_LEN; if (resid > boundary) { resid -= boundary; dout("%s con %p was sending front\n", __func__, con); - prepare_zero_front(con, resid); - if (middle_len(con->out_msg)) - prepare_zero_middle(con, middle_len(con->out_msg)); + prepare_zero_front(con, msg, resid); + if (middle_len(msg)) + prepare_zero_middle(con, msg, middle_len(msg)); con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; - queue_zeros(con); + queue_zeros(con, msg); return; } @@ -3588,9 +3596,9 @@ static void revoke_at_finish_message(struct ceph_connection *con) if (resid > boundary) { resid -= boundary; dout("%s con %p was sending middle\n", __func__, con); - prepare_zero_middle(con, resid); + prepare_zero_middle(con, msg, resid); con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; - queue_zeros(con); + queue_zeros(con, msg); return; } @@ -3598,7 +3606,7 @@ static void revoke_at_finish_message(struct ceph_connection *con) dout("%s con %p was sending epilogue - noop\n", __func__, con); } -void ceph_con_v2_revoke(struct ceph_connection *con) +void ceph_con_v2_revoke(struct ceph_connection *con, struct ceph_msg *msg) { WARN_ON(con->v2.out_zero); @@ -3611,13 +3619,13 @@ void ceph_con_v2_revoke(struct ceph_connection *con) switch (con->v2.out_state) { case OUT_S_QUEUE_DATA: - revoke_at_queue_data(con); + revoke_at_queue_data(con, msg); break; case OUT_S_QUEUE_DATA_CONT: - revoke_at_queue_data_cont(con); + revoke_at_queue_data_cont(con, msg); break; case OUT_S_FINISH_MESSAGE: - revoke_at_finish_message(con); + revoke_at_finish_message(con, msg); break; default: WARN(1, "bad out_state %d", con->v2.out_state); -- cgit v1.2.3 From 207696b17f38e869e59889b44d395ab24bb678d3 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Thu, 18 Sep 2025 22:30:18 +0300 Subject: tpm: use a map for tpm2_calc_ordinal_duration() The current shenanigans for duration calculation introduce too much complexity for a trivial problem, and the code is also hard to patch and maintain. Address these issues with a flat look-up table, which is easy to understand and patch. If leaf-driver-specific patching is required in the future, it is easy enough to make a copy of this table during driver initialization and add the chip parameter back. 'chip->duration' is retained for TPM 1.x. As the first entry under this new behavior, address the TCG spec update mentioned in this issue: https://github.com/raspberrypi/linux/issues/7054 Therefore, for TPM_SelfTest the duration is set to 3000 ms. This is not categorized as a bug, given that the new duration was introduced to the spec after the feature was originally implemented.
Reviewed-by: Serge Hallyn Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm-interface.c | 2 +- drivers/char/tpm/tpm.h | 2 +- drivers/char/tpm/tpm2-cmd.c | 127 ++++++++++----------------------------- include/linux/tpm.h | 5 +- 4 files changed, 37 insertions(+), 99 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c index b71725827743..c9f173001d0e 100644 --- a/drivers/char/tpm/tpm-interface.c +++ b/drivers/char/tpm/tpm-interface.c @@ -52,7 +52,7 @@ MODULE_PARM_DESC(suspend_pcr, unsigned long tpm_calc_ordinal_duration(struct tpm_chip *chip, u32 ordinal) { if (chip->flags & TPM_CHIP_FLAG_TPM2) - return tpm2_calc_ordinal_duration(chip, ordinal); + return tpm2_calc_ordinal_duration(ordinal); else return tpm1_calc_ordinal_duration(chip, ordinal); } diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h index 7bb87fa5f7a1..2726bd38e5ac 100644 --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -299,7 +299,7 @@ ssize_t tpm2_get_tpm_pt(struct tpm_chip *chip, u32 property_id, ssize_t tpm2_get_pcr_allocation(struct tpm_chip *chip); int tpm2_auto_startup(struct tpm_chip *chip); void tpm2_shutdown(struct tpm_chip *chip, u16 shutdown_type); -unsigned long tpm2_calc_ordinal_duration(struct tpm_chip *chip, u32 ordinal); +unsigned long tpm2_calc_ordinal_duration(u32 ordinal); int tpm2_probe(struct tpm_chip *chip); int tpm2_get_cc_attrs_tbl(struct tpm_chip *chip); int tpm2_find_cc(struct tpm_chip *chip, u32 cc); diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index 524d802ede26..7d77f6fbc152 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -28,120 +28,57 @@ static struct tpm2_hash tpm2_hash_map[] = { int tpm2_get_timeouts(struct tpm_chip *chip) { - /* Fixed timeouts for TPM2 */ chip->timeout_a = msecs_to_jiffies(TPM2_TIMEOUT_A); chip->timeout_b = msecs_to_jiffies(TPM2_TIMEOUT_B); chip->timeout_c = msecs_to_jiffies(TPM2_TIMEOUT_C); chip->timeout_d = msecs_to_jiffies(TPM2_TIMEOUT_D); - - /* PTP spec timeouts */ - chip->duration[TPM_SHORT] = msecs_to_jiffies(TPM2_DURATION_SHORT); - chip->duration[TPM_MEDIUM] = msecs_to_jiffies(TPM2_DURATION_MEDIUM); - chip->duration[TPM_LONG] = msecs_to_jiffies(TPM2_DURATION_LONG); - - /* Key creation commands long timeouts */ - chip->duration[TPM_LONG_LONG] = - msecs_to_jiffies(TPM2_DURATION_LONG_LONG); - chip->flags |= TPM_CHIP_FLAG_HAVE_TIMEOUTS; - return 0; } -/** - * tpm2_ordinal_duration_index() - returns an index to the chip duration table - * @ordinal: TPM command ordinal. - * - * The function returns an index to the chip duration table - * (enum tpm_duration), that describes the maximum amount of - * time the chip could take to return the result for a particular ordinal. - * - * The values of the MEDIUM, and LONG durations are taken - * from the PC Client Profile (PTP) specification (750, 2000 msec) - * - * LONG_LONG is for commands that generates keys which empirically takes - * a longer time on some systems. - * - * Return: - * * TPM_MEDIUM - * * TPM_LONG - * * TPM_LONG_LONG - * * TPM_UNDEFINED +/* + * Contains the maximum durations in milliseconds for TPM2 commands. 
*/ -static u8 tpm2_ordinal_duration_index(u32 ordinal) -{ - switch (ordinal) { - /* Startup */ - case TPM2_CC_STARTUP: /* 144 */ - return TPM_MEDIUM; - - case TPM2_CC_SELF_TEST: /* 143 */ - return TPM_LONG; - - case TPM2_CC_GET_RANDOM: /* 17B */ - return TPM_LONG; - - case TPM2_CC_SEQUENCE_UPDATE: /* 15C */ - return TPM_MEDIUM; - case TPM2_CC_SEQUENCE_COMPLETE: /* 13E */ - return TPM_MEDIUM; - case TPM2_CC_EVENT_SEQUENCE_COMPLETE: /* 185 */ - return TPM_MEDIUM; - case TPM2_CC_HASH_SEQUENCE_START: /* 186 */ - return TPM_MEDIUM; - - case TPM2_CC_VERIFY_SIGNATURE: /* 177 */ - return TPM_LONG_LONG; - - case TPM2_CC_PCR_EXTEND: /* 182 */ - return TPM_MEDIUM; - - case TPM2_CC_HIERARCHY_CONTROL: /* 121 */ - return TPM_LONG; - case TPM2_CC_HIERARCHY_CHANGE_AUTH: /* 129 */ - return TPM_LONG; - - case TPM2_CC_GET_CAPABILITY: /* 17A */ - return TPM_MEDIUM; - - case TPM2_CC_NV_READ: /* 14E */ - return TPM_LONG; - - case TPM2_CC_CREATE_PRIMARY: /* 131 */ - return TPM_LONG_LONG; - case TPM2_CC_CREATE: /* 153 */ - return TPM_LONG_LONG; - case TPM2_CC_CREATE_LOADED: /* 191 */ - return TPM_LONG_LONG; - - default: - return TPM_UNDEFINED; - } -} +static const struct { + unsigned long ordinal; + unsigned long duration; +} tpm2_ordinal_duration_map[] = { + {TPM2_CC_STARTUP, 750}, + {TPM2_CC_SELF_TEST, 3000}, + {TPM2_CC_GET_RANDOM, 2000}, + {TPM2_CC_SEQUENCE_UPDATE, 750}, + {TPM2_CC_SEQUENCE_COMPLETE, 750}, + {TPM2_CC_EVENT_SEQUENCE_COMPLETE, 750}, + {TPM2_CC_HASH_SEQUENCE_START, 750}, + {TPM2_CC_VERIFY_SIGNATURE, 30000}, + {TPM2_CC_PCR_EXTEND, 750}, + {TPM2_CC_HIERARCHY_CONTROL, 2000}, + {TPM2_CC_HIERARCHY_CHANGE_AUTH, 2000}, + {TPM2_CC_GET_CAPABILITY, 750}, + {TPM2_CC_NV_READ, 2000}, + {TPM2_CC_CREATE_PRIMARY, 30000}, + {TPM2_CC_CREATE, 30000}, + {TPM2_CC_CREATE_LOADED, 30000}, +}; /** - * tpm2_calc_ordinal_duration() - calculate the maximum command duration - * @chip: TPM chip to use. + * tpm2_calc_ordinal_duration() - Calculate the maximum command duration * @ordinal: TPM command ordinal. * - * The function returns the maximum amount of time the chip could take - * to return the result for a particular ordinal in jiffies. - * - * Return: A maximal duration time for an ordinal in jiffies. + * Returns the maximum amount of time the chip is expected by kernel to + * take in jiffies. 
*/ -unsigned long tpm2_calc_ordinal_duration(struct tpm_chip *chip, u32 ordinal) +unsigned long tpm2_calc_ordinal_duration(u32 ordinal) { - unsigned int index; + int i; - index = tpm2_ordinal_duration_index(ordinal); + for (i = 0; i < ARRAY_SIZE(tpm2_ordinal_duration_map); i++) + if (ordinal == tpm2_ordinal_duration_map[i].ordinal) + return msecs_to_jiffies(tpm2_ordinal_duration_map[i].duration); - if (index != TPM_UNDEFINED) - return chip->duration[index]; - else - return msecs_to_jiffies(TPM2_DURATION_DEFAULT); + return msecs_to_jiffies(TPM2_DURATION_DEFAULT); } - struct tpm2_pcr_read_out { __be32 update_cnt; __be32 pcr_selects_cnt; diff --git a/include/linux/tpm.h b/include/linux/tpm.h index b0e9eb5ef022..dc0338a783f3 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -228,10 +228,11 @@ enum tpm2_timeouts { TPM2_TIMEOUT_B = 4000, TPM2_TIMEOUT_C = 200, TPM2_TIMEOUT_D = 30, +}; + +enum tpm2_durations { TPM2_DURATION_SHORT = 20, - TPM2_DURATION_MEDIUM = 750, TPM2_DURATION_LONG = 2000, - TPM2_DURATION_LONG_LONG = 300000, TPM2_DURATION_DEFAULT = 120000, }; -- cgit v1.2.3 From a8482d2c9071d75c920eba0db36428898250ea57 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 11 Oct 2025 12:31:53 +0200 Subject: Revert "i2c: boardinfo: Annotate code used in init phase only" This reverts commit 1a2b423be6a89dd07d5fc27ea042be68697a6a49 because we got a regression report and need time to find out the details. Reported-by: Konrad Dybcio Closes: https://lore.kernel.org/r/29ec0082-4dd4-4120-acd2-44b35b4b9487@oss.qualcomm.com Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-boardinfo.c | 4 ++-- include/linux/i2c.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-boardinfo.c b/drivers/i2c/i2c-boardinfo.c index 338800321f8b..4df8ad092df3 100644 --- a/drivers/i2c/i2c-boardinfo.c +++ b/drivers/i2c/i2c-boardinfo.c @@ -22,7 +22,7 @@ EXPORT_SYMBOL_GPL(__i2c_board_lock); LIST_HEAD(__i2c_board_list); EXPORT_SYMBOL_GPL(__i2c_board_list); -int __i2c_first_dynamic_bus_num __ro_after_init; +int __i2c_first_dynamic_bus_num; EXPORT_SYMBOL_GPL(__i2c_first_dynamic_bus_num); @@ -48,7 +48,7 @@ EXPORT_SYMBOL_GPL(__i2c_first_dynamic_bus_num); * The board info passed can safely be __initdata, but be careful of embedded * pointers (for platform_data, functions, etc) since that won't be copied. */ -int __init i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned len) +int i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned len) { int status; diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 11a19241e360..20fd41b51d5c 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -499,7 +499,7 @@ static inline struct i2c_client *i2c_verify_client(struct device *dev) * Modules for add-on boards must use other calls. */ #ifdef CONFIG_I2C_BOARDINFO -int __init +int i2c_register_board_info(int busnum, struct i2c_board_info const *info, unsigned n); #else -- cgit v1.2.3
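The ceph messenger patch above is a single mechanical refactor applied throughout: instead of each helper reaching back into con->out_msg, the caller resolves the message once (via ceph_con_get_out_msg()) and threads it through as an explicit parameter. A minimal standalone C sketch of that pattern follows; the conn/msg types and function names here are illustrative stand-ins, not the real ceph structures.

	#include <stdio.h>

	struct msg  { int data_len; };
	struct conn { struct msg *out_msg; };

	/* Before: the helper implicitly depends on conn->out_msg. */
	static void queue_data_before(struct conn *con)
	{
		printf("queueing %d bytes\n", con->out_msg->data_len);
	}

	/*
	 * After: the message is an explicit argument, so the dependency is
	 * visible in the signature and the helper no longer assumes which
	 * message the connection currently points at.
	 */
	static void queue_data_after(struct conn *con, struct msg *m)
	{
		(void)con; /* still passed for connection state in the real code */
		printf("queueing %d bytes\n", m->data_len);
	}

	int main(void)
	{
		struct msg  m = { .data_len = 42 };
		struct conn c = { .out_msg = &m };

		queue_data_before(&c);
		queue_data_after(&c, c.out_msg); /* caller resolves out_msg once */
		return 0;
	}

This mirrors what populate_out_iter() does in the patch: it fetches the message once and passes it down to queue_data(), queue_data_cont(), and queue_zeros().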
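For the tpm2-cmd.c change, the heart of the patch is replacing a switch that mapped ordinals to duration classes with a direct ordinal-to-milliseconds table scanned linearly. A self-contained userspace sketch of that lookup is below; the ordinal constants match the hex values noted in the removed code's comments, durations are taken from the patch's table, and msecs_to_jiffies() is omitted so the sketch returns milliseconds rather than jiffies.

	#include <stdio.h>

	/* Illustrative stand-ins; the real constants live in include/linux/tpm.h. */
	#define TPM2_CC_SELF_TEST     0x0143
	#define TPM2_CC_GET_RANDOM    0x017B
	#define TPM2_DURATION_DEFAULT 120000 /* ms */

	static const struct {
		unsigned long ordinal;
		unsigned long duration_ms;
	} duration_map[] = {
		{ TPM2_CC_SELF_TEST,  3000 },  /* raised per the TCG spec update */
		{ TPM2_CC_GET_RANDOM, 2000 },
	};

	/* Linear scan with a fallback, mirroring tpm2_calc_ordinal_duration(). */
	static unsigned long ordinal_duration_ms(unsigned long ordinal)
	{
		size_t i;

		for (i = 0; i < sizeof(duration_map) / sizeof(duration_map[0]); i++)
			if (ordinal == duration_map[i].ordinal)
				return duration_map[i].duration_ms;

		return TPM2_DURATION_DEFAULT;
	}

	int main(void)
	{
		printf("SelfTest: %lu ms\n", ordinal_duration_ms(TPM2_CC_SELF_TEST));
		printf("Unknown:  %lu ms\n", ordinal_duration_ms(0x0999));
		return 0;
	}

The table trades the O(1) switch for a short O(n) scan, which is negligible for ~16 entries and, as the commit message notes, makes per-driver copies of the table straightforward if they are ever needed.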